Repository: clxia12/RT-DETRv3 Branch: main Commit: 349e7d99a506 Files: 393 Total size: 3.9 MB Directory structure: gitextract_swotw2om/ ├── .gitignore ├── LICENSE ├── README.md ├── configs/ │ ├── datasets/ │ │ ├── coco_detection.yml │ │ ├── coco_instance.yml │ │ ├── culane.yml │ │ ├── dota.yml │ │ ├── dota_ms.yml │ │ ├── lvis_detection.yml │ │ ├── mcmot.yml │ │ ├── mot.yml │ │ ├── objects365_detection.yml │ │ ├── roadsign_voc.yml │ │ ├── sniper_coco_detection.yml │ │ ├── sniper_visdrone_detection.yml │ │ ├── spine_coco.yml │ │ ├── visdrone_detection.yml │ │ ├── voc.yml │ │ └── wider_face.yml │ ├── rtdetrv3/ │ │ ├── _base_/ │ │ │ ├── optimizer_6x.yml │ │ │ ├── rtdetr_reader.yml │ │ │ └── rtdetrv3_r50vd.yml │ │ ├── rtdetrv3_r18vd_6x_coco.yml │ │ ├── rtdetrv3_r18vd_6x_lvis.yml │ │ ├── rtdetrv3_r34vd_6x_coco.yml │ │ ├── rtdetrv3_r50vd_6x_coco.yml │ │ └── rtdetrv3_r50vd_6x_lvis.yml │ └── runtime.yml ├── dataset/ │ ├── coco/ │ │ └── download_coco.py │ ├── dota/ │ │ └── .gitignore │ ├── mot/ │ │ └── gen_labels_MOT.py │ ├── roadsign_voc/ │ │ ├── download_roadsign_voc.py │ │ └── label_list.txt │ ├── spine_coco/ │ │ └── download_spine_coco.py │ ├── voc/ │ │ ├── create_list.py │ │ ├── download_voc.py │ │ └── label_list.txt │ └── wider_face/ │ └── download_wider_face.sh ├── ppdet/ │ ├── __init__.py │ ├── core/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ ├── schema.py │ │ │ └── yaml_helpers.py │ │ └── workspace.py │ ├── data/ │ │ ├── __init__.py │ │ ├── crop_utils/ │ │ │ ├── __init__.py │ │ │ ├── annotation_cropper.py │ │ │ └── chip_box_utils.py │ │ ├── culane_utils.py │ │ ├── reader.py │ │ ├── shm_utils.py │ │ ├── source/ │ │ │ ├── __init__.py │ │ │ ├── category.py │ │ │ ├── coco.py │ │ │ ├── culane.py │ │ │ ├── dataset.py │ │ │ ├── keypoint_coco.py │ │ │ ├── lvis.py │ │ │ ├── mot.py │ │ │ ├── pose3d_cmb.py │ │ │ ├── sniper_coco.py │ │ │ ├── voc.py │ │ │ └── widerface.py │ │ ├── transform/ │ │ │ ├── __init__.py │ │ │ ├── atss_assigner.py │ │ │ ├── autoaugment_utils.py │ │ │ ├── batch_operators.py │ │ │ ├── culane_operators.py │ │ │ ├── gridmask_utils.py │ │ │ ├── keypoint_operators.py │ │ │ ├── keypoints_3d_operators.py │ │ │ ├── mot_operators.py │ │ │ ├── op_helper.py │ │ │ ├── operators.py │ │ │ └── rotated_operators.py │ │ └── utils.py │ ├── engine/ │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── env.py │ │ ├── export_utils.py │ │ ├── naive_sync_bn.py │ │ ├── tracker.py │ │ ├── trainer.py │ │ ├── trainer_cot.py │ │ └── trainer_ssod.py │ ├── ext_op/ │ │ ├── README.md │ │ ├── csrc/ │ │ │ ├── matched_rbox_iou/ │ │ │ │ ├── matched_rbox_iou.cc │ │ │ │ └── matched_rbox_iou.cu │ │ │ ├── nms_rotated/ │ │ │ │ ├── nms_rotated.cc │ │ │ │ └── nms_rotated.cu │ │ │ └── rbox_iou/ │ │ │ ├── rbox_iou.cc │ │ │ ├── rbox_iou.cu │ │ │ └── rbox_iou_utils.h │ │ ├── setup.py │ │ └── unittest/ │ │ ├── test_matched_rbox_iou.py │ │ └── test_rbox_iou.py │ ├── metrics/ │ │ ├── __init__.py │ │ ├── coco_utils.py │ │ ├── culane_metrics.py │ │ ├── fast_cocoeval/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── ext/ │ │ │ │ ├── cocoeval.cc │ │ │ │ ├── cocoeval.h │ │ │ │ └── setup.py │ │ │ └── fast_cocoeval.py │ │ ├── json_results.py │ │ ├── keypoint_metrics.py │ │ ├── lvis_utils.py │ │ ├── map_utils.py │ │ ├── mcmot_metrics.py │ │ ├── metrics.py │ │ ├── mot_metrics.py │ │ ├── munkres.py │ │ ├── pose3d_metrics.py │ │ └── widerface_utils.py │ ├── model_zoo/ │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── model_zoo.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── test_get_model.py │ │ └── test_list_model.py │ ├── 
modeling/ │ │ ├── __init__.py │ │ ├── architectures/ │ │ │ ├── __init__.py │ │ │ ├── blazeface.py │ │ │ ├── bytetrack.py │ │ │ ├── cascade_rcnn.py │ │ │ ├── centernet.py │ │ │ ├── centertrack.py │ │ │ ├── clrnet.py │ │ │ ├── deepsort.py │ │ │ ├── detr.py │ │ │ ├── detr_ssod.py │ │ │ ├── fairmot.py │ │ │ ├── faster_rcnn.py │ │ │ ├── fcos.py │ │ │ ├── gfl.py │ │ │ ├── jde.py │ │ │ ├── keypoint_hrhrnet.py │ │ │ ├── keypoint_hrnet.py │ │ │ ├── keypoint_petr.py │ │ │ ├── keypoint_vitpose.py │ │ │ ├── mask_rcnn.py │ │ │ ├── meta_arch.py │ │ │ ├── multi_stream_detector.py │ │ │ ├── picodet.py │ │ │ ├── pose3d_metro.py │ │ │ ├── ppyoloe.py │ │ │ ├── queryinst.py │ │ │ ├── retinanet.py │ │ │ ├── rtdetrv3.py │ │ │ ├── s2anet.py │ │ │ ├── solov2.py │ │ │ ├── sparse_rcnn.py │ │ │ ├── ssd.py │ │ │ ├── tood.py │ │ │ ├── ttfnet.py │ │ │ ├── yolo.py │ │ │ ├── yolof.py │ │ │ └── yolox.py │ │ ├── assigners/ │ │ │ ├── __init__.py │ │ │ ├── atss_assigner.py │ │ │ ├── clrnet_assigner.py │ │ │ ├── fcosr_assigner.py │ │ │ ├── hungarian_assigner.py │ │ │ ├── max_iou_assigner.py │ │ │ ├── pose_utils.py │ │ │ ├── rotated_task_aligned_assigner.py │ │ │ ├── simota_assigner.py │ │ │ ├── task_aligned_assigner.py │ │ │ ├── task_aligned_assigner_cr.py │ │ │ ├── uniform_assigner.py │ │ │ └── utils.py │ │ ├── backbones/ │ │ │ ├── __init__.py │ │ │ ├── blazenet.py │ │ │ ├── clrnet_resnet.py │ │ │ ├── convnext.py │ │ │ ├── csp_darknet.py │ │ │ ├── cspresnet.py │ │ │ ├── darknet.py │ │ │ ├── dla.py │ │ │ ├── esnet.py │ │ │ ├── focalnet.py │ │ │ ├── ghostnet.py │ │ │ ├── hardnet.py │ │ │ ├── hgnet_v2.py │ │ │ ├── hrnet.py │ │ │ ├── lcnet.py │ │ │ ├── lite_hrnet.py │ │ │ ├── mobilenet_v1.py │ │ │ ├── mobilenet_v3.py │ │ │ ├── mobileone.py │ │ │ ├── name_adapter.py │ │ │ ├── res2net.py │ │ │ ├── resnet.py │ │ │ ├── senet.py │ │ │ ├── shufflenet_v2.py │ │ │ ├── swin_transformer.py │ │ │ ├── trans_encoder.py │ │ │ ├── transformer_utils.py │ │ │ ├── vgg.py │ │ │ ├── vision_transformer.py │ │ │ ├── vit_mae.py │ │ │ └── vitpose.py │ │ ├── bbox_utils.py │ │ ├── clrnet_utils.py │ │ ├── cls_utils.py │ │ ├── heads/ │ │ │ ├── __init__.py │ │ │ ├── bbox_head.py │ │ │ ├── cascade_head.py │ │ │ ├── centernet_head.py │ │ │ ├── centertrack_head.py │ │ │ ├── clrnet_head.py │ │ │ ├── detr_head.py │ │ │ ├── face_head.py │ │ │ ├── fcos_head.py │ │ │ ├── fcosr_head.py │ │ │ ├── gfl_head.py │ │ │ ├── keypoint_hrhrnet_head.py │ │ │ ├── mask_head.py │ │ │ ├── petr_head.py │ │ │ ├── pico_head.py │ │ │ ├── ppyoloe_contrast_head.py │ │ │ ├── ppyoloe_head.py │ │ │ ├── ppyoloe_ins_head.py │ │ │ ├── ppyoloe_r_head.py │ │ │ ├── retina_head.py │ │ │ ├── roi_extractor.py │ │ │ ├── s2anet_head.py │ │ │ ├── simota_head.py │ │ │ ├── solov2_head.py │ │ │ ├── sparse_roi_head.py │ │ │ ├── sparsercnn_head.py │ │ │ ├── ssd_head.py │ │ │ ├── tood_head.py │ │ │ ├── ttf_head.py │ │ │ ├── vitpose_head.py │ │ │ ├── yolo_head.py │ │ │ └── yolof_head.py │ │ ├── initializer.py │ │ ├── keypoint_utils.py │ │ ├── lane_utils.py │ │ ├── layers.py │ │ ├── losses/ │ │ │ ├── __init__.py │ │ │ ├── clrnet_line_iou_loss.py │ │ │ ├── clrnet_loss.py │ │ │ ├── cot_loss.py │ │ │ ├── ctfocal_loss.py │ │ │ ├── detr_loss.py │ │ │ ├── fairmot_loss.py │ │ │ ├── fcos_loss.py │ │ │ ├── focal_loss.py │ │ │ ├── gfocal_loss.py │ │ │ ├── iou_aware_loss.py │ │ │ ├── iou_loss.py │ │ │ ├── jde_loss.py │ │ │ ├── keypoint_loss.py │ │ │ ├── pose3d_loss.py │ │ │ ├── probiou_loss.py │ │ │ ├── queryinst_loss.py │ │ │ ├── smooth_l1_loss.py │ │ │ ├── solov2_loss.py │ │ │ ├── sparsercnn_loss.py │ │ │ ├── 
ssd_loss.py │ │ │ ├── supcontrast.py │ │ │ ├── varifocal_loss.py │ │ │ └── yolo_loss.py │ │ ├── mot/ │ │ │ ├── __init__.py │ │ │ ├── matching/ │ │ │ │ ├── __init__.py │ │ │ │ ├── deepsort_matching.py │ │ │ │ ├── jde_matching.py │ │ │ │ └── ocsort_matching.py │ │ │ ├── motion/ │ │ │ │ ├── __init__.py │ │ │ │ ├── gmc.py │ │ │ │ ├── kalman_filter.py │ │ │ │ └── ocsort_kalman_filter.py │ │ │ ├── tracker/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_jde_tracker.py │ │ │ │ ├── base_sde_tracker.py │ │ │ │ ├── botsort_tracker.py │ │ │ │ ├── center_tracker.py │ │ │ │ ├── deepsort_tracker.py │ │ │ │ ├── jde_tracker.py │ │ │ │ └── ocsort_tracker.py │ │ │ ├── utils.py │ │ │ └── visualization.py │ │ ├── necks/ │ │ │ ├── __init__.py │ │ │ ├── bifpn.py │ │ │ ├── blazeface_fpn.py │ │ │ ├── centernet_fpn.py │ │ │ ├── channel_mapper.py │ │ │ ├── clrnet_fpn.py │ │ │ ├── csp_pan.py │ │ │ ├── custom_pan.py │ │ │ ├── dilated_encoder.py │ │ │ ├── es_pan.py │ │ │ ├── fpn.py │ │ │ ├── hrfpn.py │ │ │ ├── lc_pan.py │ │ │ ├── ttf_fpn.py │ │ │ └── yolo_fpn.py │ │ ├── ops.py │ │ ├── post_process.py │ │ ├── proposal_generator/ │ │ │ ├── __init__.py │ │ │ ├── anchor_generator.py │ │ │ ├── embedding_rpn_head.py │ │ │ ├── proposal_generator.py │ │ │ ├── rpn_head.py │ │ │ ├── target.py │ │ │ └── target_layer.py │ │ ├── rbox_utils.py │ │ ├── reid/ │ │ │ ├── __init__.py │ │ │ ├── fairmot_embedding_head.py │ │ │ ├── jde_embedding_head.py │ │ │ ├── pplcnet_embedding.py │ │ │ ├── pyramidal_embedding.py │ │ │ ├── resnet.py │ │ │ └── resnet_embedding.py │ │ ├── shape_spec.py │ │ ├── ssod/ │ │ │ ├── __init__.py │ │ │ ├── losses.py │ │ │ └── utils.py │ │ ├── tests/ │ │ │ ├── __init__.py │ │ │ ├── test_architectures.py │ │ │ ├── test_base.py │ │ │ ├── test_mstest.py │ │ │ ├── test_ops.py │ │ │ └── test_yolov3_loss.py │ │ └── transformers/ │ │ ├── __init__.py │ │ ├── deformable_transformer.py │ │ ├── detr_transformer.py │ │ ├── dino_transformer.py │ │ ├── ext_op/ │ │ │ ├── README.md │ │ │ ├── ms_deformable_attn_op.cc │ │ │ ├── ms_deformable_attn_op.cu │ │ │ ├── setup_ms_deformable_attn_op.py │ │ │ └── test_ms_deformable_attn_op.py │ │ ├── group_detr_transformer.py │ │ ├── hybrid_encoder.py │ │ ├── mask_dino_transformer.py │ │ ├── mask_rtdetr_transformer.py │ │ ├── matchers.py │ │ ├── petr_transformer.py │ │ ├── position_encoding.py │ │ ├── rtdetr_transformer.py │ │ ├── rtdetr_transformerv2.py │ │ ├── rtdetr_transformerv3.py │ │ └── utils.py │ ├── optimizer/ │ │ ├── __init__.py │ │ ├── adamw.py │ │ ├── ema.py │ │ ├── optimizer.py │ │ └── utils.py │ ├── slim/ │ │ ├── __init__.py │ │ ├── distill_loss.py │ │ ├── distill_model.py │ │ ├── ofa.py │ │ ├── prune.py │ │ ├── quant.py │ │ └── unstructured_prune.py │ └── utils/ │ ├── __init__.py │ ├── cam_utils.py │ ├── check.py │ ├── checkpoint.py │ ├── cli.py │ ├── colormap.py │ ├── compact.py │ ├── download.py │ ├── fuse_utils.py │ ├── logger.py │ ├── profiler.py │ ├── stats.py │ ├── visualizer.py │ └── voc_utils.py ├── requirements.txt ├── scripts/ │ ├── build_wheel.sh │ ├── eval.sh │ ├── kill.sh │ └── train.sh └── tools/ ├── anchor_cluster.py ├── box_distribution.py ├── cam_ppdet.py ├── eval.py ├── eval_mot.py ├── export_model.py ├── gen_semi_coco.py ├── infer.py ├── infer_culane.py ├── infer_mot.py ├── post_quant.py ├── slice_image.py ├── sniper_params_stats.py ├── train.py └── x2coco.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore 
================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: README.md ================================================ English | [简体中文](README_cn.md) ## RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision :fire::fire:**[WACV 2025 Oral]** The official implementation of the paper "[RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision](https://arxiv.org/pdf/2409.08475)". \ [[`arXiv`](https://arxiv.org/pdf/2409.08475)] ![image](https://github.com/user-attachments/assets/5910d729-cc44-49f4-b404-b6631576930f) ## Model Zoo on COCO | Model | Epoch | Backbone | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) | Weight | Config | Log |:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---| | RT-DETRv3-R18 | 6x | ResNet-18 | 640 | 48.1 | 66.2 | 20 | 60 | 217 |[baidu 网盘](https://pan.baidu.com/s/1s7lyT6_fHmczoegQZXdX-w?pwd=54jp) [google drive](https://drive.google.com/file/d/1zIDOjn1qDccC3TBsDlGQHOjVrehd26bk/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml) | | RT-DETRv3-R34 | 6x | ResNet-34 | 640 | 49.9 | 67.7 | 31 | 92 | 161 | [baidu 网盘](https://pan.baidu.com/s/1VCg6oqNVF9_ZZdmlhUBgSA?pwd=pi32) [google drive](https://drive.google.com/file/d/12-wqAF8i67eqbocaWPK33d4tFkN2wGi2/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml) | | RT-DETRv3-R50 | 6x | ResNet-50 | 640 | 53.4 | 71.7 | 42 | 136 | 108 | [baidu 网盘](https://pan.baidu.com/s/1DuvrpMIqbU5okoDp16C94g?pwd=wrxy) [google drive](https://drive.google.com/file/d/1wfJE-QgdgqKE0IkiTuoD5HEbZwwZg3sQ/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml) | | RT-DETRv3-R101 | 6x | ResNet-101 | 640 | 54.6 | 73.1 | 76 | 259 | 74 | | [config](./configs/rtdetrv3/rtdetrv3_r101vd_6x_coco.yml) | **Notes:** - RT-DETRv3 uses 4 GPUs for training. - RT-DETRv3 was trained on COCO train2017 and evaluated on val2017. ## Model Zoo on LVIS | Model | Epoch | Backbone | Input shape | AP | $AP_{r}$ | $AP_{c}$ | $AP_{f}$ | Weight | Config | Log |:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---| | RT-DETRv3-R18 | 6x | ResNet-18 | 640 | 26.5 | 12.5 | 24.3 | 35.2 | | [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml) | | RT-DETRv3-R50 | 6x | ResNet-50 | 640 | 33.9 | 20.2 | 32.5 | 41.5 | | [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml) | ## Quick start
Install requirements ```bash pip install -r requirements.txt ```
Compile (optional) ```bash cd ./ppdet/modeling/transformers/ext_op/ python setup_ms_deformable_attn_op.py install ``` See [details](./ppdet/modeling/transformers/ext_op/)
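As a quick post-build check, the compiled op can be imported directly. A minimal sketch; the module and function names (`deformable_detr_ops`, `ms_deformable_attn`) are assumptions taken from the setup script and unit test under `ppdet/modeling/transformers/ext_op/`, so verify them against your build:

```python
# Hedged post-build check: module/function names assumed from
# setup_ms_deformable_attn_op.py and test_ms_deformable_attn_op.py.
try:
    from deformable_detr_ops import ms_deformable_attn  # noqa: F401
    print("custom ms_deformable_attn op is available")
except ImportError:
    print("custom op not found; the pure-Paddle fallback will be used")
```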
Data preparation - Download and extract COCO 2017 train and val images. ``` path/to/coco/ annotations/ # annotation json files train2017/ # train images val2017/ # val images ``` - Set [`dataset_dir`](configs/datasets/coco_detection.yml) in the dataset config to your COCO path (a quick layout check is sketched below)
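Before training, a quick layout check can catch a missing annotation file early. A minimal sketch, not part of the repo; `coco_root` is a placeholder for your dataset path:

```python
# Hypothetical layout check for the COCO directory sketched above.
import os

coco_root = "path/to/coco"  # placeholder: point at your dataset root
expected = [
    "annotations/instances_train2017.json",
    "annotations/instances_val2017.json",
    "train2017",
    "val2017",
]
for rel in expected:
    path = os.path.join(coco_root, rel)
    print("OK     " if os.path.exists(path) else "MISSING", path)
```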
Training & Evaluation & Testing - Training on a Single GPU: ```shell # training on single-GPU export CUDA_VISIBLE_DEVICES=0 python tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --eval ``` - Training on Multiple GPUs: ```shell # training on multi-GPU export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --fleet --eval ``` - Evaluation: ```shell python tools/eval.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \ -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams ``` - Inference: ```shell python tools/infer.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \ -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams \ --infer_img=./demo/000000570688.jpg ```
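The `-c` config file and `-o` overrides in the commands above are handled by ppdet's workspace helpers, so the same config can be loaded and inspected programmatically. A minimal sketch; the keys printed below come from the YAML files in this repo:

```python
# Load the training config the way tools/train.py does.
from ppdet.core.workspace import load_config, merge_config

cfg = load_config("configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml")
merge_config({"log_iter": 100})  # -o key=value overrides map onto merge_config

print(cfg["architecture"])     # RTDETRV3
print(cfg["epoch"])            # 72, i.e. the 6x schedule
print(cfg["num_queries_o2m"])  # 450, the auxiliary one-to-many branch
```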
## Deploy
1. Export model ```shell python tools/export_model.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \ -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams trt=True \ --output_dir=output_inference ```
2. Convert to ONNX - Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX ```shell pip install onnx==1.13.0 pip install paddle2onnx==1.0.5 ``` - Convert: ```shell paddle2onnx --model_dir=./output_inference/rtdetrv3_r18vd_6x_coco/ \ --model_filename model.pdmodel \ --params_filename model.pdiparams \ --opset_version 16 \ --save_file rtdetrv3_r18vd_6x_coco.onnx ```
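Before building a TensorRT engine, the exported ONNX graph can be smoke-tested with onnxruntime. A minimal sketch, assuming `onnxruntime` is installed and all inputs are float32; input names are introspected rather than hard-coded, since export settings determine whether extra inputs (e.g. `scale_factor`) are present:

```python
# Hedged smoke test: run the exported graph on all-zero inputs just to
# confirm it loads and executes (float32 inputs assumed).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("rtdetrv3_r18vd_6x_coco.onnx",
                            providers=["CPUExecutionProvider"])
feeds = {}
for inp in sess.get_inputs():
    # replace symbolic/dynamic dims (e.g. batch) with 1
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feeds[inp.name] = np.zeros(shape, dtype=np.float32)

for meta, out in zip(sess.get_outputs(), sess.run(None, feeds)):
    print(meta.name, out.shape)
```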
3. Convert to TensorRT - TensorRT version >= 8.5.1 - For inference benchmarking, refer to [Benchmark](../benchmark) ```shell trtexec --onnx=./rtdetrv3_r18vd_6x_coco.onnx \ --workspace=4096 \ --shapes=image:1x3x640x640 \ --saveEngine=rtdetrv3_r18vd_6x_coco.trt \ --avgRuns=100 \ --fp16 ```
## Citation If you find RT-DETRv3 useful in your research, please consider giving a star ⭐ and citing: ``` @article{wang2024rt, title={RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision}, author={Wang, Shuo and Xia, Chunlong and Lv, Feng and Shi, Yifeng}, journal={arXiv preprint arXiv:2409.08475}, year={2024} } ``` ================================================ FILE: configs/datasets/coco_detection.yml ================================================ metric: COCO num_classes: 80 TrainDataset: name: COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: name: COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO allow_empty: true TestDataset: name: ImageFolder anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' ================================================ FILE: configs/datasets/coco_instance.yml ================================================ metric: COCO num_classes: 80 TrainDataset: name: COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json dataset_dir: dataset/coco data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_poly', 'is_crowd'] EvalDataset: name: COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json dataset_dir: dataset/coco TestDataset: name: ImageFolder anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' ================================================ FILE: configs/datasets/culane.yml ================================================ metric: CULaneMetric num_classes: 5 # 4 lanes + background cut_height: &cut_height 270 dataset_dir: &dataset_dir dataset/culane TrainDataset: name: CULaneDataSet dataset_dir: *dataset_dir list_path: 'list/train_gt.txt' split: train cut_height: *cut_height EvalDataset: name: CULaneDataSet dataset_dir: *dataset_dir list_path: 'list/test.txt' split: test cut_height: *cut_height TestDataset: name: CULaneDataSet dataset_dir: *dataset_dir list_path: 'list/test.txt' split: test cut_height: *cut_height ================================================ FILE: configs/datasets/dota.yml ================================================ metric: RBOX num_classes: 15 TrainDataset: !COCODataSet image_dir: trainval1024/images anno_path: trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] EvalDataset: !COCODataSet image_dir: trainval1024/images anno_path: trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] TestDataset: !ImageFolder anno_path: test1024/DOTA_test1024.json dataset_dir: dataset/dota/ ================================================ FILE: configs/datasets/dota_ms.yml ================================================ metric: RBOX num_classes: 15 TrainDataset: !COCODataSet image_dir: trainval1024/images anno_path: trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota_ms/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] EvalDataset: !COCODataSet image_dir: trainval1024/images anno_path: 
trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota_ms/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] TestDataset: !ImageFolder anno_path: test1024/DOTA_test1024.json dataset_dir: dataset/dota_ms/ ================================================ FILE: configs/datasets/lvis_detection.yml ================================================ metric: LVIS num_classes: 1203 TrainDataset: name: LVISDataSet image_dir: . anno_path: annotations/lvis_v1_train.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: name: LVISDataSet image_dir: . anno_path: annotations/lvis_v1_val.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO allow_empty: true TestDataset: name: ImageFolder anno_path: annotations/lvis_v1_val.json # also support txt (like VOC's label_list.txt) dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO # if set, anno_path will be 'dataset_dir/anno_path' ================================================ FILE: configs/datasets/mcmot.yml ================================================ metric: MCMOT num_classes: 10 # using VisDrone2019 MOT dataset with 10 classes as default, you can modify it for your needs. # for MCMOT training TrainDataset: !MCMOTDataSet dataset_dir: dataset/mot image_lists: ['visdrone_mcmot.train'] data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide'] label_list: label_list.txt # for MCMOT evaluation # If you want to change the MCMOT evaluation dataset, please modify 'data_root' EvalMOTDataset: !MOTImageFolder dataset_dir: dataset/mot data_root: visdrone_mcmot/images/val keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT # for MCMOT video inference TestMOTDataset: !MOTImageFolder dataset_dir: dataset/mot keep_ori_im: True # set True if save visualization images or video ================================================ FILE: configs/datasets/mot.yml ================================================ metric: MOT num_classes: 1 # for MOT training TrainDataset: !MOTDataSet dataset_dir: dataset/mot image_lists: ['mot17.train', 'caltech.all', 'cuhksysu.train', 'prw.train', 'citypersons.train', 'eth.train'] data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide'] # for MOT evaluation # If you want to change the MOT evaluation dataset, please modify 'data_root' EvalMOTDataset: !MOTImageFolder dataset_dir: dataset/mot data_root: MOT16/images/train keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT # for MOT video inference TestMOTDataset: !MOTImageFolder dataset_dir: dataset/mot keep_ori_im: True # set True if save visualization images or video ================================================ FILE: configs/datasets/objects365_detection.yml ================================================ metric: COCO num_classes: 365 TrainDataset: !COCODataSet image_dir: train anno_path: annotations/zhiyuan_objv2_train.json dataset_dir: dataset/objects365 data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: !COCODataSet image_dir: val anno_path: annotations/zhiyuan_objv2_val.json dataset_dir: dataset/objects365 allow_empty: true TestDataset: !ImageFolder anno_path: annotations/zhiyuan_objv2_val.json dataset_dir: dataset/objects365/ ================================================ FILE: configs/datasets/roadsign_voc.yml ================================================ metric: VOC map_type: integral num_classes: 4 TrainDataset: name: VOCDataSet dataset_dir: 
dataset/roadsign_voc anno_path: train.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] EvalDataset: name: VOCDataSet dataset_dir: dataset/roadsign_voc anno_path: valid.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] TestDataset: name: ImageFolder anno_path: dataset/roadsign_voc/label_list.txt ================================================ FILE: configs/datasets/sniper_coco_detection.yml ================================================ metric: SNIPERCOCO num_classes: 80 TrainDataset: !SniperCOCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json dataset_dir: dataset/coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: true image_target_sizes: [2000, 1000] valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]] chip_target_size: 512 chip_target_stride: 200 use_neg_chip: false max_neg_num_per_im: 8 EvalDataset: !SniperCOCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json dataset_dir: dataset/coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: false image_target_sizes: [2000, 1000] valid_box_ratio_ranges: [[-1, 0.1], [0.08, -1]] chip_target_size: 512 chip_target_stride: 200 max_per_img: -1 nms_thresh: 0.5 TestDataset: !SniperCOCODataSet image_dir: val2017 dataset_dir: dataset/coco is_trainset: false image_target_sizes: [2000, 1000] valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]] chip_target_size: 500 chip_target_stride: 200 max_per_img: -1 nms_thresh: 0.5 ================================================ FILE: configs/datasets/sniper_visdrone_detection.yml ================================================ metric: SNIPERCOCO num_classes: 9 TrainDataset: !SniperCOCODataSet image_dir: train anno_path: annotations/train.json dataset_dir: dataset/VisDrone2019_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: true image_target_sizes: [8145, 2742] valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]] chip_target_size: 1536 chip_target_stride: 1184 use_neg_chip: false max_neg_num_per_im: 8 EvalDataset: !SniperCOCODataSet image_dir: val anno_path: annotations/val.json dataset_dir: dataset/VisDrone2019_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: false image_target_sizes: [8145, 2742] valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]] chip_target_size: 1536 chip_target_stride: 1184 max_per_img: -1 nms_thresh: 0.5 TestDataset: !SniperCOCODataSet image_dir: val dataset_dir: dataset/VisDrone2019_coco is_trainset: false image_target_sizes: [8145, 2742] valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]] chip_target_size: 1536 chip_target_stride: 1184 max_per_img: -1 nms_thresh: 0.5 ================================================ FILE: configs/datasets/spine_coco.yml ================================================ metric: RBOX num_classes: 9 TrainDataset: !COCODataSet image_dir: images anno_path: annotations/train.json dataset_dir: dataset/spine_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] EvalDataset: !COCODataSet image_dir: images anno_path: annotations/valid.json dataset_dir: dataset/spine_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] TestDataset: !ImageFolder anno_path: annotations/valid.json dataset_dir: dataset/spine_coco ================================================ 
FILE: configs/datasets/visdrone_detection.yml ================================================ metric: COCO num_classes: 10 TrainDataset: !COCODataSet image_dir: VisDrone2019-DET-train anno_path: train.json dataset_dir: dataset/visdrone data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: !COCODataSet image_dir: VisDrone2019-DET-val anno_path: val.json # image_dir: test_dev # anno_path: test_dev.json dataset_dir: dataset/visdrone TestDataset: !ImageFolder anno_path: val.json dataset_dir: dataset/visdrone ================================================ FILE: configs/datasets/voc.yml ================================================ metric: VOC map_type: 11point num_classes: 20 TrainDataset: name: VOCDataSet dataset_dir: dataset/voc anno_path: trainval.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] EvalDataset: name: VOCDataSet dataset_dir: dataset/voc anno_path: test.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] TestDataset: name: ImageFolder anno_path: dataset/voc/label_list.txt ================================================ FILE: configs/datasets/wider_face.yml ================================================ metric: WiderFace num_classes: 1 TrainDataset: !WIDERFaceDataSet dataset_dir: dataset/wider_face anno_path: wider_face_split/wider_face_train_bbx_gt.txt image_dir: WIDER_train/images data_fields: ['image', 'gt_bbox', 'gt_class'] EvalDataset: !WIDERFaceValDataset dataset_dir: dataset/wider_face image_dir: WIDER_val/images anno_path: wider_face_split/wider_face_val_bbx_gt.txt gt_mat_path: WIDER_val/ground_truth data_fields: ['image', 'gt_bbox', 'gt_class', 'ori_gt_bbox'] TestDataset: !ImageFolder use_default_label: true ================================================ FILE: configs/rtdetrv3/_base_/optimizer_6x.yml ================================================ epoch: 72 LearningRate: base_lr: 0.0004 schedulers: - !PiecewiseDecay gamma: 1.0 milestones: [100] use_warmup: true - !LinearWarmup start_factor: 0.001 steps: 2000 OptimizerBuilder: clip_grad_by_norm: 0.1 regularizer: false optimizer: type: AdamW weight_decay: 0.0001 ================================================ FILE: configs/rtdetrv3/_base_/rtdetr_reader.yml ================================================ worker_num: 4 TrainReader: sample_transforms: - Decode: {} - RandomDistort: {prob: 0.8} - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} - RandomCrop: {prob: 0.8} - RandomFlip: {} batch_transforms: - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False} - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} - NormalizeBox: {retain_origin_box: true} - BboxXYXY2XYWH: {} - Permute: {} - PadGT: {only_origin_box: true} batch_size: 16 shuffle: true drop_last: true collate_batch: false use_shared_memory: true EvalReader: sample_transforms: - Decode: {} - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} - Permute: {} batch_size: 16 shuffle: false drop_last: false TestReader: inputs_def: image_shape: [3, 640, 640] sample_transforms: - Decode: {} - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} - Permute: {} batch_size: 1 shuffle: false drop_last: false ================================================ FILE: 
configs/rtdetrv3/_base_/rtdetrv3_r50vd.yml ================================================ architecture: RTDETRV3 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams norm_type: sync_bn use_ema: True ema_decay: 0.9999 ema_decay_type: "exponential" ema_filter_no_grad: True hidden_dim: 256 use_focal_loss: True eval_size: [640, 640] RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: # index 0 stands for res2 depth: 50 variant: d norm_type: bn freeze_at: 0 return_idx: [1, 2, 3] lr_mult_list: [0.1, 0.1, 0.1, 0.1] num_stages: 4 freeze_stem_only: True HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0. activation: 'gelu' expansion: 1.0 RTDETRTransformerv3: num_queries: 300 position_embed_type: sine feat_strides: [8, 16, 32] num_levels: 3 nhead: 8 num_decoder_layers: 6 dim_feedforward: 1024 dropout: 0.0 activation: relu num_denoising: 100 label_noise_ratio: 0.5 box_noise_scale: 1.0 learnt_init_query: False num_noises: 0 num_noise_queries: [] num_noise_denoising: 100 DINOv3Head: o2m: 4 loss: name: DINOv3Loss loss_coeff: {class: 1, bbox: 5, giou: 2} aux_loss: True use_vfl: True matcher: name: HungarianMatcher matcher_coeff: {class: 2, bbox: 5, giou: 2} PPYOLOEHead: fpn_strides: [8, 16, 32] grid_cell_scale: 5.0 grid_cell_offset: 0.5 static_assigner_epoch: 30 use_varifocal_loss: True loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5} static_assigner: name: ATSSAssigner topk: 9 assigner: name: TaskAlignedAssigner topk: 13 alpha: 1.0 beta: 6.0 nms: name: MultiClassNMS nms_top_k: 1000 keep_top_k: 300 score_threshold: 0.01 nms_threshold: 0.7 DETRPostProcess: num_top_queries: 300 ================================================ FILE: configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml ================================================ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r18vd_6x_coco/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: depth: 18 variant: d return_idx: [1, 2, 3] freeze_at: -1 freeze_norm: false norm_decay: 0. HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0.
activation: 'gelu' expansion: 0.5 depth_mult: 1.0 RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 3 num_noises: 3 num_noise_queries: [300, 300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml ================================================ _BASE_: [ '../datasets/lvis_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3vd_r18_6x_lvis/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: depth: 18 variant: d return_idx: [1, 2, 3] freeze_at: -1 freeze_norm: false norm_decay: 0. HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0. activation: 'gelu' expansion: 0.5 depth_mult: 1.0 RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 3 num_noises: 2 num_noise_queries: [300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml ================================================ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r34vd_6x_coco/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: depth: 34 variant: d return_idx: [1, 2, 3] freeze_at: -1 freeze_norm: false norm_decay: 0. HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0. 
activation: 'gelu' expansion: 0.5 depth_mult: 1.0 RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 4 num_noises: 3 num_noise_queries: [300, 300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml ================================================ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r50vd_6x_coco/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 6 num_noises: 2 num_noise_queries: [300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml ================================================ _BASE_: [ '../datasets/lvis_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r50vd_6x_lvis/model_final find_unused_parameters: True log_iter: 200 snapshot_epoch: 2 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 6 num_noises: 1 num_noise_queries: [300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/runtime.yml ================================================ use_gpu: true use_xpu: false use_mlu: false use_npu: false log_iter: 20 save_dir: output snapshot_epoch: 1 print_flops: false print_params: false # Exporting the model export: post_process: True # Whether post-processing is included in the network when exporting the model. nms: True # Whether NMS is included in the network when exporting the model. benchmark: False # Used to test model performance; if set `True`, post-process and NMS will not be exported. fuse_conv_bn: False ================================================ FILE: dataset/coco/download_coco.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'coco') ================================================ FILE: dataset/dota/.gitignore ================================================ ================================================ FILE: dataset/mot/gen_labels_MOT.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os.path as osp import os import numpy as np MOT_data = 'MOT16' # choose a data in ['MOT15', 'MOT16', 'MOT17', 'MOT20'] # or your custom data (prepare it following the 'docs/tutorials/PrepareMOTDataSet.md') def mkdirs(d): if not osp.exists(d): os.makedirs(d) seq_root = './{}/images/train'.format(MOT_data) label_root = './{}/labels_with_ids/train'.format(MOT_data) mkdirs(label_root) seqs = [s for s in os.listdir(seq_root)] tid_curr = 0 tid_last = -1 for seq in seqs: seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find( '\nimHeight')]) seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find( '\nimExt')]) gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') seq_label_root = osp.join(label_root, seq, 'img1') mkdirs(seq_label_root) for fid, tid, x, y, w, h, mark, label, _ in gt: if mark == 0 or not label == 1: continue fid = int(fid) tid = int(tid) if not tid == tid_last: tid_curr += 1 tid_last = tid x += w / 2 y += h / 2 label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) with open(label_fpath, 'a') as f: f.write(label_str) ================================================ FILE: dataset/roadsign_voc/download_roadsign_voc.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'roadsign_voc') ================================================ FILE: dataset/roadsign_voc/label_list.txt ================================================ speedlimit crosswalk trafficlight stop ================================================ FILE: dataset/spine_coco/download_spine_coco.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'spine_coco') ================================================ FILE: dataset/voc/create_list.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import create_voc_list logging.basicConfig(level=logging.INFO) voc_path = osp.split(osp.realpath(sys.argv[0]))[0] create_voc_list(voc_path) ================================================ FILE: dataset/voc/download_voc.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'voc') ================================================ FILE: dataset/voc/label_list.txt ================================================ aeroplane bicycle bird boat bottle bus car cat chair cow diningtable dog horse motorbike person pottedplant sheep sofa train tvmonitor ================================================ FILE: dataset/wider_face/download_wider_face.sh ================================================ # All rights `PaddleDetection` reserved # References: # @inproceedings{yang2016wider, # Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou}, # Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, # Title = {WIDER FACE: A Face Detection Benchmark}, # Year = {2016}} DIR="$( cd "$(dirname "$0")" ; pwd -P )" cd "$DIR" # Download the data. echo "Downloading..." wget https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip wget https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip wget https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip # Extract the data. echo "Extracting..." unzip -q WIDER_train.zip unzip -q WIDER_val.zip unzip -q wider_face_split.zip ================================================ FILE: ppdet/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import (core, data, engine, modeling, model_zoo, optimizer, metrics, utils, slim) try: from .version import full_version as __version__ from .version import commit as __git_commit__ except ImportError: import sys sys.stderr.write("Warning: import ppdet from source directory " \ "without installing, run 'python setup.py install' to " \ "install ppdet firstly\n") ================================================ FILE: ppdet/core/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . 
import config ================================================ FILE: ppdet/core/config/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppdet/core/config/schema.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import print_function from __future__ import division import inspect import importlib import re try: from docstring_parser import parse as doc_parse except Exception: def doc_parse(*args): pass try: from typeguard import check_type except Exception: def check_type(*args): pass __all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] class SchemaValue(object): def __init__(self, name, doc='', type=None): super(SchemaValue, self).__init__() self.name = name self.doc = doc self.type = type def set_default(self, value): self.default = value def has_default(self): return hasattr(self, 'default') class SchemaDict(dict): def __init__(self, **kwargs): super(SchemaDict, self).__init__() self.schema = {} self.strict = False self.doc = "" self.update(kwargs) def __setitem__(self, key, value): # XXX also update regular dict to SchemaDict?? 
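        # if both the existing value and the incoming one are dicts, merge the
        # incoming dict into the existing SchemaDict instead of replacing it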
        if isinstance(value, dict) and key in self and isinstance(self[key],
                                                                   SchemaDict):
            self[key].update(value)
        else:
            super(SchemaDict, self).__setitem__(key, value)

    def __missing__(self, key):
        if self.has_default(key):
            return self.schema[key].default
        elif key in self.schema:
            return self.schema[key]
        else:
            raise KeyError(key)

    def copy(self):
        newone = SchemaDict()
        newone.__dict__.update(self.__dict__)
        newone.update(self)
        return newone

    def set_schema(self, key, value):
        assert isinstance(value, SchemaValue)
        self.schema[key] = value

    def set_strict(self, strict):
        self.strict = strict

    def has_default(self, key):
        return key in self.schema and self.schema[key].has_default()

    def is_default(self, key):
        if not self.has_default(key):
            return False
        if hasattr(self[key], '__dict__'):
            return True
        else:
            return key not in self or self[key] == self.schema[key].default

    def find_default_keys(self):
        return [
            k for k in list(self.keys()) + list(self.schema.keys())
            if self.is_default(k)
        ]

    def mandatory(self):
        return any([k for k in self.schema.keys() if not self.has_default(k)])

    def find_missing_keys(self):
        missing = [
            k for k in self.schema.keys()
            if k not in self and not self.has_default(k)
        ]
        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
        return missing + placeholders

    def find_extra_keys(self):
        return list(set(self.keys()) - set(self.schema.keys()))

    def find_mismatch_keys(self):
        mismatch_keys = []
        for arg in self.schema.values():
            if arg.type is not None:
                try:
                    check_type("{}.{}".format(self.name, arg.name),
                               self[arg.name], arg.type)
                except Exception:
                    mismatch_keys.append(arg.name)
        return mismatch_keys

    def validate(self):
        missing_keys = self.find_missing_keys()
        if missing_keys:
            raise ValueError("Missing param for class<{}>: {}".format(
                self.name, ", ".join(missing_keys)))
        extra_keys = self.find_extra_keys()
        if extra_keys and self.strict:
            raise ValueError("Extraneous param for class<{}>: {}".format(
                self.name, ", ".join(extra_keys)))
        mismatch_keys = self.find_mismatch_keys()
        if mismatch_keys:
            raise TypeError("Wrong param type for class<{}>: {}".format(
                self.name, ", ".join(mismatch_keys)))


class SharedConfig(object):
    """
    Representation class for `__shared__` annotations, which work as follows:

    - if `key` is set for the module in config file, its value will take
      precedence
    - if `key` is not set for the module but present in the config file, its
      value will be used
    - otherwise, use the provided `default_value` as fallback

    Args:
        key: config[key] will be injected
        default_value: fallback value
    """

    def __init__(self, key, default_value=None):
        super(SharedConfig, self).__init__()
        self.key = key
        self.default_value = default_value


def extract_schema(cls):
    """
    Extract schema from a given class

    Args:
        cls (type): Class from which to extract.

    Returns:
        schema (SchemaDict): Extracted schema.
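
    Example (an illustrative sketch with a hypothetical class, not repo code):

        class MyHead(object):
            __shared__ = ['num_classes']

            def __init__(self, num_classes=80, in_channels=256):
                pass

        schema = extract_schema(MyHead)
        # the 'num_classes' entry defaults to SharedConfig('num_classes', 80),
        # the 'in_channels' entry defaults to 256, and schema.strict is True
        # because __init__ accepts no **kwargs.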
""" ctor = cls.__init__ # python 2 compatibility if hasattr(inspect, 'getfullargspec'): argspec = inspect.getfullargspec(ctor) annotations = argspec.annotations has_kwargs = argspec.varkw is not None else: argspec = inspect.getfullargspec(ctor) # python 2 type hinting workaround, see pep-3107 # however, since `typeguard` does not support python 2, type checking # is still python 3 only for now annotations = getattr(ctor, '__annotations__', {}) has_kwargs = argspec.varkw is not None names = [arg for arg in argspec.args if arg != 'self'] defaults = argspec.defaults num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 num_required = len(names) - num_defaults docs = cls.__doc__ if docs is None and getattr(cls, '__category__', None) == 'op': docs = cls.__call__.__doc__ try: docstring = doc_parse(docs) except Exception: docstring = None if docstring is None: comments = {} else: comments = {} for p in docstring.params: match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) if match_obj is not None: comments[match_obj.group(1)] = p.description schema = SchemaDict() schema.name = cls.__name__ schema.doc = "" if docs is not None: start_pos = docs[0] == '\n' and 1 or 0 schema.doc = docs[start_pos:].split("\n")[0].strip() # XXX handle paddle's weird doc convention if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: schema.doc = schema.doc[2:-2].strip() schema.category = hasattr(cls, '__category__') and getattr( cls, '__category__') or 'module' schema.strict = not has_kwargs schema.pymodule = importlib.import_module(cls.__module__) schema.inject = getattr(cls, '__inject__', []) schema.shared = getattr(cls, '__shared__', []) for idx, name in enumerate(names): comment = name in comments and comments[name] or name if name in schema.inject: type_ = None else: type_ = name in annotations and annotations[name] or None value_schema = SchemaValue(name, comment, type_) if name in schema.shared: assert idx >= num_required, "shared config must have default value" default = defaults[idx - num_required] value_schema.set_default(SharedConfig(name, default)) elif idx >= num_required: default = defaults[idx - num_required] value_schema.set_default(default) schema.set_schema(name, value_schema) return schema ================================================ FILE: ppdet/core/config/yaml_helpers.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import importlib import inspect import yaml from .schema import SharedConfig __all__ = ['serializable', 'Callable'] def represent_dictionary_order(self, dict_data): return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) def setup_orderdict(): from collections import OrderedDict yaml.add_representer(OrderedDict, represent_dictionary_order) def _make_python_constructor(cls): def python_constructor(loader, node): if isinstance(node, yaml.SequenceNode): args = loader.construct_sequence(node, deep=True) return cls(*args) else: kwargs = loader.construct_mapping(node, deep=True) try: return cls(**kwargs) except Exception as ex: print("Error when construct {} instance from yaml config". format(cls.__name__)) raise ex return python_constructor def _make_python_representer(cls): # python 2 compatibility if hasattr(inspect, 'getfullargspec'): argspec = inspect.getfullargspec(cls) else: argspec = inspect.getfullargspec(cls.__init__) argnames = [arg for arg in argspec.args if arg != 'self'] def python_representer(dumper, obj): if argnames: data = {name: getattr(obj, name) for name in argnames} else: data = obj.__dict__ if '_id' in data: del data['_id'] return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) return python_representer def serializable(cls): """ Add loader and dumper for given class, which must be "trivially serializable" Args: cls: class to be serialized Returns: cls """ yaml.add_constructor(u'!{}'.format(cls.__name__), _make_python_constructor(cls)) yaml.add_representer(cls, _make_python_representer(cls)) return cls yaml.add_representer(SharedConfig, lambda d, o: d.represent_data(o.default_value)) @serializable class Callable(object): """ Helper to be used in Yaml for creating arbitrary class objects Args: full_type (str): the full module path to target function """ def __init__(self, full_type, args=[], kwargs={}): super(Callable, self).__init__() self.full_type = full_type self.args = args self.kwargs = kwargs def __call__(self): if '.' in self.full_type: idx = self.full_type.rfind('.') module = importlib.import_module(self.full_type[:idx]) func_name = self.full_type[idx + 1:] else: try: module = importlib.import_module('builtins') except Exception: module = importlib.import_module('__builtin__') func_name = self.full_type func = getattr(module, func_name) return func(*self.args, **self.kwargs) ================================================ FILE: ppdet/core/workspace.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
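# Illustrative usage sketch (hypothetical module, not repo code): classes
# decorated with @register (defined below) are entered into global_config by
# name and can then be instantiated with create(), with their kwargs resolved
# from the loaded YAML config:
#
#   from ppdet.core.workspace import register, create, load_config
#
#   @register
#   class MyNeck(object):
#       __shared__ = ['num_classes']
#
#       def __init__(self, num_classes=80, out_channels=256):
#           pass
#
#   load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
#   neck = create('MyNeck')  # out_channels from YAML if set, else 256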
from __future__ import absolute_import from __future__ import print_function from __future__ import division import importlib import os import sys import yaml import collections try: collectionsAbc = collections.abc except AttributeError: collectionsAbc = collections from .config.schema import SchemaDict, SharedConfig, extract_schema from .config.yaml_helpers import serializable __all__ = [ 'global_config', 'load_config', 'merge_config', 'get_registered_modules', 'create', 'register', 'serializable', 'dump_value', ] def dump_value(value): # XXX this is hackish, but collections.abc is not available in python 2 if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): value = yaml.dump(value, default_flow_style=True) value = value.replace('\n', '') value = value.replace('...', '') return "'{}'".format(value) else: # primitive types return str(value) class AttrDict(dict): """Single level attribute dict, NOT recursive""" def __init__(self, **kwargs): super(AttrDict, self).__init__() super(AttrDict, self).update(kwargs) def __getattr__(self, key): if key in self: return self[key] raise AttributeError("object has no attribute '{}'".format(key)) def __setattr__(self, key, value): self[key] = value def copy(self): new_dict = AttrDict() for k, v in self.items(): new_dict.update({k: v}) return new_dict global_config = AttrDict() BASE_KEY = '_BASE_' # parse and load _BASE_ recursively def _load_config_with_base(file_path): with open(file_path) as f: file_cfg = yaml.load(f, Loader=yaml.Loader) # NOTE: cfgs outside have higher priority than cfgs in _BASE_ if BASE_KEY in file_cfg: all_base_cfg = AttrDict() base_ymls = list(file_cfg[BASE_KEY]) for base_yml in base_ymls: if base_yml.startswith("~"): base_yml = os.path.expanduser(base_yml) if not base_yml.startswith('/'): base_yml = os.path.join(os.path.dirname(file_path), base_yml) with open(base_yml) as f: base_cfg = _load_config_with_base(base_yml) all_base_cfg = merge_config(base_cfg, all_base_cfg) del file_cfg[BASE_KEY] return merge_config(file_cfg, all_base_cfg) return file_cfg def load_config(file_path): """ Load config from file. Args: file_path (str): Path of the config file to be loaded. Returns: global config """ _, ext = os.path.splitext(file_path) assert ext in ['.yml', '.yaml'], "only support yaml files for now" # load config from file and merge into global config cfg = _load_config_with_base(file_path) cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] merge_config(cfg) return global_config def dict_merge(dct, merge_dct): """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of updating only top-level keys, dict_merge recurses down into dicts nested to an arbitrary depth, updating keys. The ``merge_dct`` is merged into ``dct``. Args: dct: dict onto which the merge is executed merge_dct: dct merged into dct Returns: dct """ for k, v in merge_dct.items(): if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], collectionsAbc.Mapping)): dict_merge(dct[k], merge_dct[k]) else: dct[k] = merge_dct[k] return dct def merge_config(config, another_cfg=None): """ Merge config into global config or another_cfg. Args: config (dict): Config to be merged. 
    Returns: global config
    """
    global global_config
    dct = another_cfg or global_config
    return dict_merge(dct, config)


def get_registered_modules():
    return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)}


def make_partial(cls):
    op_module = importlib.import_module(cls.__op__.__module__)
    op = getattr(op_module, cls.__op__.__name__)
    cls.__category__ = getattr(cls, '__category__', None) or 'op'

    def partial_apply(self, *args, **kwargs):
        kwargs_ = self.__dict__.copy()
        kwargs_.update(kwargs)
        return op(*args, **kwargs_)

    if getattr(cls, '__append_doc__', True):  # XXX should default to True?
        if sys.version_info[0] > 2:
            cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__)
            cls.__init__.__doc__ = op.__doc__
            cls.__call__ = partial_apply
            cls.__call__.__doc__ = op.__doc__
        else:
            # XXX work around for python 2
            partial_apply.__doc__ = op.__doc__
            cls.__call__ = partial_apply
    return cls


def register(cls):
    """
    Register a given module class.

    Args:
        cls (type): Module class to be registered.

    Returns: cls
    """
    if cls.__name__ in global_config:
        raise ValueError("Module class already registered: {}".format(
            cls.__name__))
    if hasattr(cls, '__op__'):
        cls = make_partial(cls)
    global_config[cls.__name__] = extract_schema(cls)
    return cls


def create(cls_or_name, **kwargs):
    """
    Create an instance of given module class.

    Args:
        cls_or_name (type or str): Class of which to create instance.

    Returns: instance of type `cls_or_name`
    """
    assert type(cls_or_name) in [type, str
                                 ], "should be a class or name of a class"
    name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__
    if name in global_config:
        if isinstance(global_config[name], SchemaDict):
            pass
        elif hasattr(global_config[name], "__dict__"):
            # support instance return directly
            return global_config[name]
        else:
            raise ValueError("The module {} is not registered".format(name))
    else:
        raise ValueError("The module {} is not registered".format(name))

    config = global_config[name]
    cls = getattr(config.pymodule, name)
    cls_kwargs = {}
    cls_kwargs.update(global_config[name])

    # parse `shared` annotation of registered modules
    if getattr(config, 'shared', None):
        for k in config.shared:
            target_key = config[k]
            shared_conf = config.schema[k].default
            assert isinstance(shared_conf, SharedConfig)
            if target_key is not None and not isinstance(target_key,
                                                         SharedConfig):
                continue  # value is given for the module
            elif shared_conf.key in global_config:
                # `key` is present in config
                cls_kwargs[k] = global_config[shared_conf.key]
            else:
                cls_kwargs[k] = shared_conf.default_value

    # parse `inject` annotation of registered modules
    if getattr(cls, 'from_config', None):
        cls_kwargs.update(cls.from_config(config, **kwargs))

    if getattr(config, 'inject', None):
        for k in config.inject:
            target_key = config[k]
            # optional dependency
            if target_key is None:
                continue

            if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):
                if 'name' not in target_key.keys():
                    continue
                inject_name = str(target_key['name'])
                if inject_name not in global_config:
                    raise ValueError(
                        "Missing injection name {}, check its name in the cfg file".
format(k)) target = global_config[inject_name] for i, v in target_key.items(): if i == 'name': continue target[i] = v if isinstance(target, SchemaDict): cls_kwargs[k] = create(inject_name) elif isinstance(target_key, str): if target_key not in global_config: raise ValueError("Missing injection config:", target_key) target = global_config[target_key] if isinstance(target, SchemaDict): cls_kwargs[k] = create(target_key) elif hasattr(target, '__dict__'): # serialized object cls_kwargs[k] = target else: raise ValueError("Unsupported injection type:", target_key) # prevent modification of global config values of reference types # (e.g., list, dict) from within the created module instances #kwargs = copy.deepcopy(kwargs) return cls(**cls_kwargs) ================================================ FILE: ppdet/data/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import source from . import transform from . import reader from .source import * from .transform import * from .reader import * ================================================ FILE: ppdet/data/crop_utils/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppdet/data/crop_utils/annotation_cropper.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
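# Illustrative usage sketch (the parameter values below mirror the examples
# given in the __init__ docstring): AnnoCropper converts full-image records
# into chip-level records for SNIPER-style multi-scale training.
#
#   cropper = AnnoCropper(
#       image_target_sizes=[2000, 1000],
#       valid_box_ratio_ranges=[[-1, 0.1], [0.08, -1]],
#       chip_target_size=500,
#       chip_target_stride=200)
#   chip_records = cropper.crop_anno_records(coco_records)
#   infer_chips = cropper.crop_infer_anno_records(coco_records)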
import copy
import math
import random
import numpy as np
from copy import deepcopy
from typing import List, Tuple
from collections import defaultdict

from .chip_box_utils import nms, transform_chip_boxes2image_boxes
from .chip_box_utils import find_chips_to_cover_overlaped_boxes
from .chip_box_utils import transform_chip_box
from .chip_box_utils import intersection_over_box


class AnnoCropper(object):
    def __init__(self,
                 image_target_sizes: List[int],
                 valid_box_ratio_ranges: List[List[float]],
                 chip_target_size: int,
                 chip_target_stride: int,
                 use_neg_chip: bool=False,
                 max_neg_num_per_im: int=8,
                 max_per_img: int=-1,
                 nms_thresh: float=0.5):
        """
        Generate chips by chip_target_size and chip_target_stride.
        These two parameters work just like kernel_size and stride in a CNN.

        Each image has its raw size; after resizing it gets its target size.
        The resizing scale = target_size / raw_size, and the same scale
        applies to the chips of the image.
        box_ratio = box_raw_size / image_raw_size
                  = box_target_size / image_target_size
        The 'size' mentioned above is the length of the long side of an
        image, box or chip.

        :param image_target_sizes: e.g. [2000, 1000]
        :param valid_box_ratio_ranges: e.g. [[-1, 0.1], [0.08, -1]]
        :param chip_target_size: e.g. 500
        :param chip_target_stride: e.g. 200
        """
        self.target_sizes = image_target_sizes
        self.valid_box_ratio_ranges = valid_box_ratio_ranges
        assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)
        self.scale_num = len(self.target_sizes)
        self.chip_target_size = chip_target_size  # is target size
        self.chip_target_stride = chip_target_stride  # is target stride
        self.use_neg_chip = use_neg_chip
        self.max_neg_num_per_im = max_neg_num_per_im
        self.max_per_img = max_per_img
        self.nms_thresh = nms_thresh

    def crop_anno_records(self, records: List[dict]):
        """
        The main logic:
        # foreach record(image):
        #   foreach scale:
        #     1. generate chips by chip size and stride for each scale
        #     2. get pos chips
        #        - validate boxes: current scale; h, w >= 1
        #        - find pos chips greedily by valid gt boxes in each scale
        #        - for every valid gt box, find its corresponding pos chips
        #          in each scale
        #     3. get neg chips
        #        - if given proposals, find neg boxes in them which are not
        #          in pos chips
        #        - if we got neg boxes in the last step, find neg chips and
        #          assign the neg boxes to the neg chips, as in step 2
        #     4. sample neg chips if there are too many for one image
        # transform this image-scale annotations to chip (pos chips & neg
        # chips) annotations

        :param records: standard coco_record but with extra key
            `proposals` (Px4), which are predicted by the stage1 model and
            may contain neg boxes.
        :return: new_records, list of dict like
        {
            'im_file': 'fake_image1.jpg',
            'im_id': np.array([1]),   # new _global_chip_id as im_id
            'h': h,                   # chip height
            'w': w,                   # chip width
            'is_crowd': is_crowd,     # Nx1 -> Mx1
            'gt_class': gt_class,     # Nx1 -> Mx1
            'gt_bbox': gt_bbox,       # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]
            'gt_poly': gt_poly,       # [None]xN -> [None]xM
            'chip': [x1, y1, x2, y2]  # added
        }

        Attention (coordinate diagram omitted; the x axis points right and
        the y axis points down): if we use [x1, y1, x2, y2] to represent
        boxes or chips, (x1, y1) is the top-left point, which is inside the
        box, but (x2, y2) is the bottom-right point, which is not in the box.
        So x1 is in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1, h].
        You can use x2 - x1 to get the width, and image[y1:y2, x1:x2] to get
        the box area.
""" self.chip_records = [] self._global_chip_id = 1 for r in records: self._cur_im_pos_chips = [ ] # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int] self._cur_im_neg_chips = [] # element: (chip, neg_box_num) for scale_i in range(self.scale_num): self._get_current_scale_parameters(scale_i, r) # Cx4 chips = self._create_chips(r['h'], r['w'], self._cur_scale) # # dict: chipid->[box_id, ...] pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips( r['gt_bbox'], chips) # dict: chipid->neg_box_num neg_chip2box_num = self._get_neg_boxes_and_chips( chips, list(pos_chip2boxes_idx.keys()), r.get('proposals', None)) self._add_to_cur_im_chips(chips, pos_chip2boxes_idx, neg_chip2box_num) cur_image_records = self._trans_all_chips2annotations(r) self.chip_records.extend(cur_image_records) return self.chip_records def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num): for pos_chipid, boxes_idx in pos_chip2boxes_idx.items(): chip = np.array(chips[pos_chipid]) # copy chips slice self._cur_im_pos_chips.append((chip, boxes_idx)) if neg_chip2box_num is None: return for neg_chipid, neg_box_num in neg_chip2box_num.items(): chip = np.array(chips[neg_chipid]) self._cur_im_neg_chips.append((chip, neg_box_num)) def _trans_all_chips2annotations(self, r): gt_bbox = r['gt_bbox'] im_file = r['im_file'] is_crowd = r['is_crowd'] gt_class = r['gt_class'] # gt_poly = r['gt_poly'] # [None]xN # remaining keys: im_id, h, w chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox, is_crowd, gt_class) if not self.use_neg_chip: return chip_records sampled_neg_chips = self._sample_neg_chips() neg_chip_records = self._trans_neg_chips2annotations(im_file, sampled_neg_chips) chip_records.extend(neg_chip_records) return chip_records def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd, gt_class): chip_records = [] for chip, boxes_idx in self._cur_im_pos_chips: chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx, chip) x1, y1, x2, y2 = chip chip_h = y2 - y1 chip_w = x2 - x1 rec = { 'im_file': im_file, 'im_id': np.array([self._global_chip_id]), 'h': chip_h, 'w': chip_w, 'gt_bbox': chip_bbox, 'is_crowd': is_crowd[final_boxes_idx].copy(), 'gt_class': gt_class[final_boxes_idx].copy(), # 'gt_poly': [None] * len(final_boxes_idx), 'chip': chip } self._global_chip_id += 1 chip_records.append(rec) return chip_records def _sample_neg_chips(self): pos_num = len(self._cur_im_pos_chips) neg_num = len(self._cur_im_neg_chips) sample_num = min(pos_num + 2, self.max_neg_num_per_im) assert sample_num >= 1 if neg_num <= sample_num: return self._cur_im_neg_chips candidate_num = int(sample_num * 1.5) candidate_neg_chips = sorted( self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num] random.shuffle(candidate_neg_chips) sampled_neg_chips = candidate_neg_chips[:sample_num] return sampled_neg_chips def _trans_neg_chips2annotations(self, im_file: str, sampled_neg_chips: List[Tuple]): chip_records = [] for chip, neg_box_num in sampled_neg_chips: x1, y1, x2, y2 = chip chip_h = y2 - y1 chip_w = x2 - x1 rec = { 'im_file': im_file, 'im_id': np.array([self._global_chip_id]), 'h': chip_h, 'w': chip_w, 'gt_bbox': np.zeros( (0, 4), dtype=np.float32), 'is_crowd': np.zeros( (0, 1), dtype=np.int32), 'gt_class': np.zeros( (0, 1), dtype=np.int32), # 'gt_poly': [], 'chip': chip } self._global_chip_id += 1 chip_records.append(rec) return chip_records def _get_current_scale_parameters(self, scale_i, r): im_size = max(r['h'], r['w']) im_target_size = self.target_sizes[scale_i] 
        self._cur_im_size, self._cur_im_target_size = im_size, im_target_size
        self._cur_scale = self._get_current_scale(im_target_size, im_size)
        self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]

    def _get_current_scale(self, im_target_size, im_size):
        return im_target_size / im_size

    def _create_chips(self, h: int, w: int, scale: float):
        """
        Generate chips by chip_target_size and chip_target_stride.
        These two parameters work just like kernel_size and stride in a CNN.
        :return: chips, Cx4, xy in raw size dimension
        """
        chip_size = self.chip_target_size  # omit target for simplicity
        stride = self.chip_target_stride
        width = int(scale * w)
        height = int(scale * h)
        min_chip_location_diff = 20  # in target size

        assert chip_size >= stride
        chip_overlap = chip_size - stride
        if (width - chip_overlap) % stride > min_chip_location_diff:
            # the leftover that a full stride cannot cover is large, keep it
            w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
        else:
            # the leftover that a full stride cannot cover is small, drop it
            w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
        if (height - chip_overlap) % stride > min_chip_location_diff:
            h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))
        else:
            h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))

        chips = list()
        for j in range(h_steps):
            for i in range(w_steps):
                x1 = i * stride
                y1 = j * stride
                x2 = min(x1 + chip_size, width)
                y2 = min(y1 + chip_size, height)
                chips.append([x1, y1, x2, y2])

        # check chip size
        for item in chips:
            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
                    1] > chip_size * 1.1:
                raise ValueError(item)
        chips = np.array(chips, dtype=np.float32)

        raw_size_chips = chips / scale
        return raw_size_chips

    def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):
        valid_ratio_range = self._cur_valid_ratio_range
        im_size = self._cur_im_size
        scale = self._cur_scale
        # Nx4 N
        valid_boxes, valid_boxes_idx = self._validate_boxes(
            valid_ratio_range, im_size, gt_bbox, scale)
        # dict: chipid->[box_id, ...]
        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
                                                  valid_boxes_idx)
        return pos_chip2boxes_idx

    def _validate_boxes(self,
                        valid_ratio_range: List[float],
                        im_size: int,
                        gt_boxes: 'np.array of Nx4',
                        scale: float):
        """
        :return: valid_boxes: Nx4, valid_boxes_idx: N
        """
        ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)
        hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)
        maxs = np.maximum(ws, hs)
        box_ratio = maxs / im_size
        mins = np.minimum(ws, hs)
        target_mins = mins * scale

        low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
            np.float32).max
        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) &
                                     (target_mins >= 2))[0]
        valid_boxes = gt_boxes[valid_boxes_idx]
        return valid_boxes, valid_boxes_idx

    def _find_pos_chips(self, chips: 'Cx4', valid_boxes: 'Bx4',
                        valid_boxes_idx: 'B'):
        """
        :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
        """
        iob = intersection_over_box(chips, valid_boxes)  # overlap, CxB

        iob_threshold_to_find_chips = 1.
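        # a threshold of 1.0 marks a chip as positive only if it fully
        # contains at least one valid gt box (intersection over box == 1)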
pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes( iob, iob_threshold_to_find_chips) pos_chip_ids = set(pos_chip_ids) iob_threshold_to_assign_box = 0.5 pos_chip2boxes_idx = self._assign_boxes_to_pos_chips( iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx) return pos_chip2boxes_idx def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold): return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold) def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids, valid_boxes_idx): chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) pos_chip2boxes_idx = defaultdict(list) for chip_id, box_id in zip(chip_ids, box_ids): if chip_id not in pos_chip_ids: continue raw_gt_box_idx = valid_boxes_idx[box_id] pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx) return pos_chip2boxes_idx def _get_neg_boxes_and_chips(self, chips: 'Cx4', pos_chip_ids: 'D', proposals: 'Px4'): """ :param chips: :param pos_chip_ids: :param proposals: :return: neg_chip2box_num, None or dict: chipid->neg_box_num """ if not self.use_neg_chip: return None # train proposals maybe None if proposals is None or len(proposals) < 1: return None valid_ratio_range = self._cur_valid_ratio_range im_size = self._cur_im_size scale = self._cur_scale valid_props, _ = self._validate_boxes(valid_ratio_range, im_size, proposals, scale) neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props) neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes) return neg_chip2box_num def _find_neg_boxes(self, chips: 'Cx4', pos_chip_ids: 'D', valid_props: 'Px4'): """ :return: neg_boxes: Nx4 """ if len(pos_chip_ids) == 0: return valid_props pos_chips = chips[pos_chip_ids] iob = intersection_over_box(pos_chips, valid_props) overlap_per_prop = np.max(iob, axis=0) non_overlap_props_idx = overlap_per_prop < 0.5 neg_boxes = valid_props[non_overlap_props_idx] return neg_boxes def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D', neg_boxes: 'Nx4'): """ :return: neg_chip2box_num, dict: chipid->neg_box_num """ neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids) neg_chips = chips[neg_chip_ids] iob = intersection_over_box(neg_chips, neg_boxes) iob_threshold_to_find_chips = 0.7 chosen_neg_chip_ids, chip_id2overlap_box_num = \ self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips) neg_chipid2box_num = {} for cid in chosen_neg_chip_ids: box_num = chip_id2overlap_box_num[cid] raw_chip_id = neg_chip_ids[cid] neg_chipid2box_num[raw_chip_id] = box_num return neg_chipid2box_num def crop_infer_anno_records(self, records: List[dict]): """ transform image record to chips record :param records: :return: new_records, list of dict like { 'im_file': 'fake_image1.jpg', 'im_id': np.array([1]), # new _global_chip_id as im_id 'h': h, # chip height 'w': w, # chip width 'chip': [x1, y1, x2, y2] # added 'ori_im_h': ori_im_h # added, origin image height 'ori_im_w': ori_im_w # added, origin image width 'scale_i': 0 # added, } """ self.chip_records = [] self._global_chip_id = 1 # im_id start from 1 self._global_chip_id2img_id = {} for r in records: for scale_i in range(self.scale_num): self._get_current_scale_parameters(scale_i, r) # Cx4 chips = self._create_chips(r['h'], r['w'], self._cur_scale) cur_img_chip_record = self._get_chips_records(r, chips, scale_i) self.chip_records.extend(cur_img_chip_record) return self.chip_records def _get_chips_records(self, rec, chips, scale_i): cur_img_chip_records = [] ori_im_h = rec["h"] ori_im_w = rec["w"] im_file = rec["im_file"] 
ori_im_id = rec["im_id"] for id, chip in enumerate(chips): chip_rec = {} x1, y1, x2, y2 = chip chip_h = y2 - y1 chip_w = x2 - x1 chip_rec["im_file"] = im_file chip_rec["im_id"] = self._global_chip_id chip_rec["h"] = chip_h chip_rec["w"] = chip_w chip_rec["chip"] = chip chip_rec["ori_im_h"] = ori_im_h chip_rec["ori_im_w"] = ori_im_w chip_rec["scale_i"] = scale_i self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id) self._global_chip_id += 1 cur_img_chip_records.append(chip_rec) return cur_img_chip_records def aggregate_chips_detections(self, results, records=None): """ # 1. transform chip dets to image dets # 2. nms boxes per image; # 3. format output results :param results: :param roidb: :return: """ results = deepcopy(results) records = records if records else self.chip_records img_id2bbox = self._transform_chip2image_bboxes(results, records) nms_img_id2bbox = self._nms_dets(img_id2bbox) aggregate_results = self._reformat_results(nms_img_id2bbox) return aggregate_results def _transform_chip2image_bboxes(self, results, records): # 1. Transform chip dets to image dets; # 2. Filter valid range; # 3. Reformat and Aggregate chip dets to Get scale_cls_dets img_id2bbox = defaultdict(list) for result in results: bbox_locs = result['bbox'] bbox_nums = result['bbox_num'] if len(bbox_locs) == 1 and bbox_locs[0][ 0] == -1: # current batch has no detections # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]] # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1. continue im_ids = result['im_id'] # replace with range(len(bbox_nums)) last_bbox_num = 0 for idx, im_id in enumerate(im_ids): cur_bbox_len = bbox_nums[idx] bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len] last_bbox_num += cur_bbox_len # box: [num_id, score, xmin, ymin, xmax, ymax] if len(bboxes) == 0: # current image has no detections continue chip_rec = records[int(im_id) - 1] # im_id starts from 1, type is np.int64 image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"]) bboxes = transform_chip_boxes2image_boxes( bboxes, chip_rec["chip"], chip_rec["ori_im_h"], chip_rec["ori_im_w"]) scale_i = chip_rec["scale_i"] cur_scale = self._get_current_scale(self.target_sizes[scale_i], image_size) _, valid_boxes_idx = self._validate_boxes( self.valid_box_ratio_ranges[scale_i], image_size, bboxes[:, 2:], cur_scale) ori_img_id = self._global_chip_id2img_id[int(im_id)] img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx]) return img_id2bbox def _nms_dets(self, img_id2bbox): # 1. NMS on each image-class # 2. 
Limit number of detections to MAX_PER_IMAGE if requested max_per_img = self.max_per_img nms_thresh = self.nms_thresh for img_id in img_id2bbox: box = img_id2bbox[ img_id] # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] box = np.concatenate(box, axis=0) nms_dets = nms(box, nms_thresh) if max_per_img > 0: if len(nms_dets) > max_per_img: keep = np.argsort(-nms_dets[:, 1])[:max_per_img] nms_dets = nms_dets[keep] img_id2bbox[img_id] = nms_dets return img_id2bbox def _reformat_results(self, img_id2bbox): """reformat results""" im_ids = img_id2bbox.keys() results = [] for img_id in im_ids: # output by original im_id order if len(img_id2bbox[img_id]) == 0: bbox = np.array( [[-1., 0., 0., 0., 0., 0.]]) # edge case: no detections bbox_num = np.array([0]) else: # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] bbox = img_id2bbox[img_id] bbox_num = np.array([len(bbox)]) res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num) results.append(res) return results ================================================ FILE: ppdet/data/crop_utils/chip_box_utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np def bbox_area(boxes): return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) def intersection_over_box(chips, boxes): """ intersection area over box area :param chips: C :param boxes: B :return: iob, CxB """ M = chips.shape[0] N = boxes.shape[0] if M * N == 0: return np.zeros([M, N], dtype='float32') box_area = bbox_area(boxes) # B inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:], boxes[:, 2:]) # CxBX2 inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2], boxes[:, :2]) # CxBx2 inter_wh = inter_x2y2 - inter_x1y1 inter_wh = np.clip(inter_wh, a_min=0, a_max=None) inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1] # CxB iob = inter_area / np.expand_dims(box_area, 0) return iob def clip_boxes(boxes, im_shape): """ Clip boxes to image boundaries. 
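    Note: x1/y1 are clipped to [0, dim - 1] while x2/y2 are clipped to
    [1, dim], matching the box convention described in annotation_cropper.py
    where (x2, y2) lies just outside the box.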
:param boxes: [N, 4] :param im_shape: tuple of 2, [h, w] :return: [N, 4] """ # x1 >= 0 boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1) # y1 >= 0 boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1) # x2 < im_shape[1] boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1]) # y2 < im_shape[0] boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0]) return boxes def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'): boxes_idx = np.array(boxes_idx) cur_gt_bbox = gt_bbox[boxes_idx].copy() # Bx4 x1, y1, x2, y2 = chip cur_gt_bbox[:, 0] -= x1 cur_gt_bbox[:, 1] -= y1 cur_gt_bbox[:, 2] -= x1 cur_gt_bbox[:, 3] -= y1 h = y2 - y1 w = x2 - x1 cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w)) ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32) hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32) valid_idx = (ws >= 2) & (hs >= 2) return cur_gt_bbox[valid_idx], boxes_idx[valid_idx] def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold): chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) chip_id2overlap_box_num = np.bincount(chip_ids) # 1d array chip_id2overlap_box_num = np.pad( chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)), constant_values=0) chosen_chip_ids = [] while len(box_ids) > 0: value_counts = np.bincount(chip_ids) # 1d array max_count_chip_id = np.argmax(value_counts) assert max_count_chip_id not in chosen_chip_ids chosen_chip_ids.append(max_count_chip_id) box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id] ids_not_in_cur_boxes_mask = np.logical_not( np.isin(box_ids, box_ids_in_cur_chip)) chip_ids = chip_ids[ids_not_in_cur_boxes_mask] box_ids = box_ids[ids_not_in_cur_boxes_mask] return chosen_chip_ids, chip_id2overlap_box_num def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w): chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1])) xmin, ymin, _, _ = chip # Transform to origin image loc chip_boxes[:, 2] += xmin chip_boxes[:, 4] += xmin chip_boxes[:, 3] += ymin chip_boxes[:, 5] += ymin chip_boxes = clip_boxes(chip_boxes, (img_h, img_w)) return chip_boxes def nms(dets, thresh): """Apply classic DPM-style greedy NMS.""" if dets.shape[0] == 0: return dets[[], :] scores = dets[:, 1] x1 = dets[:, 2] y1 = dets[:, 3] x2 = dets[:, 4] y2 = dets[:, 5] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] ndets = dets.shape[0] suppressed = np.zeros((ndets), dtype=np.int32) # nominal indices # _i, _j # sorted indices # i, j # temp variables for box i's (the box currently under consideration) # ix1, iy1, ix2, iy2, iarea # variables for computing overlap with box j (lower scoring box) # xx1, yy1, xx2, yy2 # w, h # inter, ovr for _i in range(ndets): i = order[_i] if suppressed[i] == 1: continue ix1 = x1[i] iy1 = y1[i] ix2 = x2[i] iy2 = y2[i] iarea = areas[i] for _j in range(_i + 1, ndets): j = order[_j] if suppressed[j] == 1: continue xx1 = max(ix1, x1[j]) yy1 = max(iy1, y1[j]) xx2 = min(ix2, x2[j]) yy2 = min(iy2, y2[j]) w = max(0.0, xx2 - xx1 + 1) h = max(0.0, yy2 - yy1 + 1) inter = w * h ovr = inter / (iarea + areas[j] - inter) if ovr >= thresh: suppressed[j] = 1 keep = np.where(suppressed == 0)[0] dets = dets[keep, :] return dets ================================================ FILE: ppdet/data/culane_utils.py ================================================ import math import numpy as np from imgaug.augmentables.lines import LineString from scipy.interpolate import InterpolatedUnivariateSpline def lane_to_linestrings(lanes): lines = [] for lane in lanes: lines.append(LineString(lane)) return 
lines


def linestrings_to_lanes(lines):
    lanes = []
    for line in lines:
        lanes.append(line.coords)
    return lanes


def sample_lane(points, sample_ys, img_w):
    # this function expects the points to be sorted
    points = np.array(points)
    if not np.all(points[1:, 1] < points[:-1, 1]):
        raise Exception('Annotation points have to be sorted')
    x, y = points[:, 0], points[:, 1]

    # interpolate points inside domain
    assert len(points) > 1
    interp = InterpolatedUnivariateSpline(
        y[::-1], x[::-1], k=min(3, len(points) - 1))
    domain_min_y = y.min()
    domain_max_y = y.max()
    sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & (
        sample_ys <= domain_max_y)]
    assert len(sample_ys_inside_domain) > 0
    interp_xs = interp(sample_ys_inside_domain)

    # extrapolate lane to the bottom of the image with a straight line
    # using the 2 points closest to the bottom
    two_closest_points = points[:2]
    extrap = np.polyfit(
        two_closest_points[:, 1], two_closest_points[:, 0], deg=1)
    extrap_ys = sample_ys[sample_ys > domain_max_y]
    extrap_xs = np.polyval(extrap, extrap_ys)
    all_xs = np.hstack((extrap_xs, interp_xs))

    # separate between inside and outside points
    inside_mask = (all_xs >= 0) & (all_xs < img_w)
    xs_inside_image = all_xs[inside_mask]
    xs_outside_image = all_xs[~inside_mask]

    return xs_outside_image, xs_inside_image


def filter_lane(lane):
    assert lane[-1][1] <= lane[0][1]
    filtered_lane = []
    used = set()
    for p in lane:
        if p[1] not in used:
            filtered_lane.append(p)
            used.add(p[1])
    return filtered_lane


def transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys,
                         n_strips, strip_size, anno):
    old_lanes = anno['lanes']

    # removing lanes with fewer than 2 points
    old_lanes = filter(lambda x: len(x) > 1, old_lanes)
    # sort lane points by Y (bottom to top of the image)
    old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes]
    # remove points with same Y (keep first occurrence)
    old_lanes = [filter_lane(lane) for lane in old_lanes]
    # normalize the annotation coordinates
    old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)]
                  for x, y in lane] for lane in old_lanes]
    # create transformed annotations
    lanes = np.ones(
        (max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32
    ) * -1e5  # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates
    lanes_endpoints = np.ones((max_lanes, 2))
    # lanes are invalid by default
    lanes[:, 0] = 1
    lanes[:, 1] = 0
    for lane_idx, lane in enumerate(old_lanes):
        if lane_idx >= max_lanes:
            break

        try:
            xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys,
                                                            img_w)
        except AssertionError:
            continue
        if len(xs_inside_image) <= 1:
            continue
        all_xs = np.hstack((xs_outside_image, xs_inside_image))
        lanes[lane_idx, 0] = 0
        lanes[lane_idx, 1] = 1
        lanes[lane_idx, 2] = len(xs_outside_image) / n_strips
        lanes[lane_idx, 3] = xs_inside_image[0]
        thetas = []
        for i in range(1, len(xs_inside_image)):
            theta = math.atan(
                i * strip_size /
                (xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi
            theta = theta if theta > 0 else 1 - abs(theta)
            thetas.append(theta)
        theta_far = sum(thetas) / len(thetas)

        # lanes[lane_idx,
        #       4] = (theta_closest + theta_far) / 2  # averaged angle
        lanes[lane_idx, 4] = theta_far
        lanes[lane_idx, 5] = len(xs_inside_image)
        lanes[lane_idx, 6:6 + len(all_xs)] = all_xs

        lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips
        lanes_endpoints[lane_idx, 1] = xs_inside_image[-1]

    new_anno = {
        'label': lanes,
        'old_anno': anno,
        'lane_endpoints': lanes_endpoints
    }
    return new_anno


================================================
FILE: ppdet/data/reader.py
================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import os import traceback import six import sys if sys.version_info >= (3, 0): pass else: pass import numpy as np import paddle import paddle.nn.functional as F from copy import deepcopy from paddle.io import DataLoader, DistributedBatchSampler from .utils import default_collate_fn from ppdet.core.workspace import register from . import transform from .shm_utils import _get_shared_memory_size_in_M from ppdet.utils.logger import setup_logger logger = setup_logger('reader') MAIN_PID = os.getpid() class Compose(object): def __init__(self, transforms, num_classes=80): self.transforms = transforms self.transforms_cls = [] for t in self.transforms: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.transforms_cls.append(f) def _update_transforms_cls(self, data): if 'transform_schedulers' in data: def is_valid(op): op_name = op.__class__.__name__ for t in data['transform_schedulers']: for k, v in t.items(): if op_name == k: # [start_epoch, stop_epoch) start_epoch = v.get('start_epoch', 0) if start_epoch > data['curr_epoch']: return False stop_epoch = v.get('stop_epoch', float('inf')) if stop_epoch <= data['curr_epoch']: return False return True return filter(is_valid, self.transforms_cls) else: return self.transforms_cls def __call__(self, data): transforms_cls = self._update_transforms_cls(data) for f in transforms_cls: try: data = f(data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map sample transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e return data class BatchCompose(Compose): def __init__(self, transforms, num_classes=80, collate_batch=True): super(BatchCompose, self).__init__(transforms, num_classes) self.collate_batch = collate_batch def __call__(self, data): transforms_cls = self._update_transforms_cls(data[0]) for f in transforms_cls: try: data = f(data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map batch transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e # remove keys which is not needed by model extra_key = ['h', 'w', 'flipped', 'transform_schedulers'] for k in extra_key: for sample in data: if k in sample: sample.pop(k) # batch data, if user-define batch function needed # use user-defined here if self.collate_batch: batch_data = default_collate_fn(data) else: batch_data = {} for k in data[0].keys(): tmp_data = [] for i in range(len(data)): tmp_data.append(data[i][k]) if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: tmp_data = np.stack(tmp_data, axis=0) if 'origin_' in k: tmp_data = np.stack(tmp_data, axis=0) batch_data[k] = tmp_data return batch_data class BaseDataLoader(object): """ Base DataLoader implementation for detection models Args: sample_transforms (list): a list 
of transforms to perform on each sample
        batch_transforms (list): a list of transforms to perform on batch
        batch_size (int): batch size for batch collating, default 1.
        shuffle (bool): whether to shuffle samples
        drop_last (bool): whether to drop the last incomplete batch,
            default False
        num_classes (int): class number of dataset, default 80
        collate_batch (bool): whether to collate batch in dataloader.
            If set to True, the samples will collate into batch according
            to the batch size. Otherwise, the ground-truth will not collate,
            which is used when the number of ground-truths differs across
            samples.
        use_shared_memory (bool): whether to use shared memory to
            accelerate data loading, enable this only if you are sure that
            the shared memory size of your OS is larger than the memory cost
            of the input data of the model. Note that shared memory will be
            automatically disabled if the shared memory of the OS is less
            than 1G, which is not enough for detection models. Default False.
    """

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 drop_last=False,
                 num_classes=80,
                 collate_batch=True,
                 use_shared_memory=False,
                 **kwargs):
        # sample transform
        self._sample_transforms = Compose(
            sample_transforms, num_classes=num_classes)

        # batch transform
        self._batch_transforms = BatchCompose(batch_transforms, num_classes,
                                              collate_batch)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.use_shared_memory = use_shared_memory
        self.kwargs = kwargs

    def __call__(self,
                 dataset,
                 worker_num,
                 batch_sampler=None,
                 return_list=False):
        self.dataset = dataset
        self.dataset.check_or_download_dataset()
        self.dataset.parse_dataset()
        # get data
        self.dataset.set_transform(self._sample_transforms)
        # set kwargs
        self.dataset.set_kwargs(**self.kwargs)
        # batch sampler
        if batch_sampler is None:
            self._batch_sampler = DistributedBatchSampler(
                self.dataset,
                batch_size=self.batch_size,
                shuffle=self.shuffle,
                drop_last=self.drop_last)
        else:
            self._batch_sampler = batch_sampler

        # DataLoader do not start sub-process in Windows and Mac
        # system, do not need to use shared memory
        use_shared_memory = self.use_shared_memory and \
            sys.platform not in ['win32', 'darwin']
        # check whether shared memory size is bigger than 1G (1024M)
        if use_shared_memory:
            shm_size = _get_shared_memory_size_in_M()
            if shm_size is not None and shm_size < 1024.:
                logger.warning("Shared memory size is less than 1G, "
                               "disable shared_memory in DataLoader")
                use_shared_memory = False

        self.dataloader = DataLoader(
            dataset=self.dataset,
            batch_sampler=self._batch_sampler,
            collate_fn=self._batch_transforms,
            num_workers=worker_num,
            return_list=return_list,
            use_shared_memory=use_shared_memory)
        self.loader = iter(self.dataloader)

        return self

    def __len__(self):
        return len(self._batch_sampler)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.loader)
        except StopIteration:
            self.loader = iter(self.dataloader)
            six.reraise(*sys.exc_info())

    def next(self):
        # python2 compatibility
        return self.__next__()


@register
class TrainReader(BaseDataLoader):
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=True,
                 drop_last=True,
                 num_classes=80,
                 collate_batch=True,
                 **kwargs):
        super(TrainReader, self).__init__(sample_transforms, batch_transforms,
                                          batch_size, shuffle, drop_last,
                                          num_classes, collate_batch, **kwargs)


@register
class EvalReader(BaseDataLoader):
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
drop_last=False, num_classes=80, **kwargs): super(EvalReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) @register class TestReader(BaseDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], batch_transforms=[], batch_size=1, shuffle=False, drop_last=False, num_classes=80, **kwargs): super(TestReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) @register class EvalMOTReader(BaseDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], batch_transforms=[], batch_size=1, shuffle=False, drop_last=False, num_classes=1, **kwargs): super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) @register class TestMOTReader(BaseDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], batch_transforms=[], batch_size=1, shuffle=False, drop_last=False, num_classes=1, **kwargs): super(TestMOTReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) # For Semi-Supervised Object Detection (SSOD) class Compose_SSOD(object): def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80): self.base_transforms = base_transforms self.base_transforms_cls = [] for t in self.base_transforms: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.base_transforms_cls.append(f) self.weak_augs = weak_aug self.weak_augs_cls = [] for t in self.weak_augs: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.weak_augs_cls.append(f) self.strong_augs = strong_aug self.strong_augs_cls = [] for t in self.strong_augs: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.strong_augs_cls.append(f) def __call__(self, data): for f in self.base_transforms_cls: try: data = f(data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map sample transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e weak_data = deepcopy(data) strong_data = deepcopy(data) for f in self.weak_augs_cls: try: weak_data = f(weak_data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map weak aug [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e for f in self.strong_augs_cls: try: strong_data = f(strong_data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map strong aug [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e weak_data['strong_aug'] = strong_data return weak_data class BatchCompose_SSOD(Compose): def __init__(self, transforms, num_classes=80, collate_batch=True): super(BatchCompose_SSOD, self).__init__(transforms, num_classes) self.collate_batch = collate_batch def __call__(self, data): # split strong_data from data(weak_data) strong_data = [] for sample in data: strong_data.append(sample['strong_aug']) sample.pop('strong_aug') for f in self.transforms_cls: try: data = f(data) if 'BatchRandomResizeForSSOD' in f._id: strong_data = f(strong_data, data[1])[0] data = data[0] else: strong_data = f(strong_data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map 
batch transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e # remove keys which is not needed by model extra_key = ['h', 'w', 'flipped'] for k in extra_key: for sample in data: if k in sample: sample.pop(k) for sample in strong_data: if k in sample: sample.pop(k) # batch data, if user-define batch function needed # use user-defined here if self.collate_batch: batch_data = default_collate_fn(data) strong_batch_data = default_collate_fn(strong_data) return batch_data, strong_batch_data else: batch_data = {} for k in data[0].keys(): tmp_data = [] for i in range(len(data)): tmp_data.append(data[i][k]) if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: tmp_data = np.stack(tmp_data, axis=0) batch_data[k] = tmp_data strong_batch_data = {} for k in strong_data[0].keys(): tmp_data = [] for i in range(len(strong_data)): tmp_data.append(strong_data[i][k]) if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: tmp_data = np.stack(tmp_data, axis=0) strong_batch_data[k] = tmp_data return batch_data, strong_batch_data class CombineSSODLoader(object): def __init__(self, label_loader, unlabel_loader): self.label_loader = label_loader self.unlabel_loader = unlabel_loader def __iter__(self): while True: try: label_samples = next(self.label_loader_iter) except: self.label_loader_iter = iter(self.label_loader) label_samples = next(self.label_loader_iter) try: unlabel_samples = next(self.unlabel_loader_iter) except: self.unlabel_loader_iter = iter(self.unlabel_loader) unlabel_samples = next(self.unlabel_loader_iter) yield ( label_samples[0], # sup weak label_samples[1], # sup strong unlabel_samples[0], # unsup weak unlabel_samples[1] # unsup strong ) def __call__(self): return self.__iter__() class BaseSemiDataLoader(object): def __init__(self, sample_transforms=[], weak_aug=[], strong_aug=[], sup_batch_transforms=[], unsup_batch_transforms=[], sup_batch_size=1, unsup_batch_size=1, shuffle=True, drop_last=True, num_classes=80, collate_batch=True, use_shared_memory=False, **kwargs): # sup transforms self._sample_transforms_label = Compose_SSOD( sample_transforms, weak_aug, strong_aug, num_classes=num_classes) self._batch_transforms_label = BatchCompose_SSOD( sup_batch_transforms, num_classes, collate_batch) self.batch_size_label = sup_batch_size # unsup transforms self._sample_transforms_unlabel = Compose_SSOD( sample_transforms, weak_aug, strong_aug, num_classes=num_classes) self._batch_transforms_unlabel = BatchCompose_SSOD( unsup_batch_transforms, num_classes, collate_batch) self.batch_size_unlabel = unsup_batch_size # common self.shuffle = shuffle self.drop_last = drop_last self.use_shared_memory = use_shared_memory self.kwargs = kwargs def __call__(self, dataset_label, dataset_unlabel, worker_num, batch_sampler_label=None, batch_sampler_unlabel=None, return_list=False): # sup dataset self.dataset_label = dataset_label self.dataset_label.check_or_download_dataset() self.dataset_label.parse_dataset() self.dataset_label.set_transform(self._sample_transforms_label) self.dataset_label.set_kwargs(**self.kwargs) if batch_sampler_label is None: self._batch_sampler_label = DistributedBatchSampler( self.dataset_label, batch_size=self.batch_size_label, shuffle=self.shuffle, drop_last=self.drop_last) else: self._batch_sampler_label = batch_sampler_label # unsup dataset self.dataset_unlabel = dataset_unlabel self.dataset_unlabel.length = self.dataset_label.__len__() self.dataset_unlabel.check_or_download_dataset() self.dataset_unlabel.parse_dataset() 
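# --- annotation (not in the original source): the unlabeled dataset's length
# is pinned to len(dataset_label) just above so that CombineSSODLoader, built
# at the end of this method, can pair one labeled batch with one unlabeled
# batch per step; each sub-loader restarts its own iterator when exhausted,
# so the combined stream never raises StopIteration on its own.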
self.dataset_unlabel.set_transform(self._sample_transforms_unlabel) self.dataset_unlabel.set_kwargs(**self.kwargs) if batch_sampler_unlabel is None: self._batch_sampler_unlabel = DistributedBatchSampler( self.dataset_unlabel, batch_size=self.batch_size_unlabel, shuffle=self.shuffle, drop_last=self.drop_last) else: self._batch_sampler_unlabel = batch_sampler_unlabel # DataLoader do not start sub-process in Windows and Mac # system, do not need to use shared memory use_shared_memory = self.use_shared_memory and \ sys.platform not in ['win32', 'darwin'] # check whether shared memory size is bigger than 1G(1024M) if use_shared_memory: shm_size = _get_shared_memory_size_in_M() if shm_size is not None and shm_size < 1024.: logger.warning("Shared memory size is less than 1G, " "disable shared_memory in DataLoader") use_shared_memory = False self.dataloader_label = DataLoader( dataset=self.dataset_label, batch_sampler=self._batch_sampler_label, collate_fn=self._batch_transforms_label, num_workers=worker_num, return_list=return_list, use_shared_memory=use_shared_memory) self.dataloader_unlabel = DataLoader( dataset=self.dataset_unlabel, batch_sampler=self._batch_sampler_unlabel, collate_fn=self._batch_transforms_unlabel, num_workers=worker_num, return_list=return_list, use_shared_memory=use_shared_memory) self.dataloader = CombineSSODLoader(self.dataloader_label, self.dataloader_unlabel) self.loader = iter(self.dataloader) return self def __len__(self): return len(self._batch_sampler_label) def __iter__(self): return self def __next__(self): return next(self.loader) def next(self): # python2 compatibility return self.__next__() @register class SemiTrainReader(BaseSemiDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], weak_aug=[], strong_aug=[], sup_batch_transforms=[], unsup_batch_transforms=[], sup_batch_size=1, unsup_batch_size=1, shuffle=True, drop_last=True, num_classes=80, collate_batch=True, **kwargs): super(SemiTrainReader, self).__init__( sample_transforms, weak_aug, strong_aug, sup_batch_transforms, unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle, drop_last, num_classes, collate_batch, **kwargs) ================================================ FILE: ppdet/data/shm_utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os SIZE_UNIT = ['K', 'M', 'G', 'T'] SHM_QUERY_CMD = 'df -h' SHM_KEY = 'shm' SHM_DEFAULT_MOUNT = '/dev/shm' # [ shared memory size check ] # In detection models, image/target data occupies a lot of memory, and # will occupy lots of shared memory in multi-process DataLoader, we use # following code to get shared memory size and perform a size check to # disable shared memory use if shared memory size is not enough. # Shared memory getting process as follows: # 1. use `df -h` get all mount info # 2. pick up spaces whose mount info contains 'shm' # 3. if 'shm' space number is only 1, return its size # 4. 
if there are multiple 'shm' space, try to find the default mount # directory '/dev/shm' is Linux-like system, otherwise return the # biggest space size. def _parse_size_in_M(size_str): if size_str[-1] == 'B': num, unit = size_str[:-2], size_str[-2] else: num, unit = size_str[:-1], size_str[-1] assert unit in SIZE_UNIT, \ "unknown shm size unit {}".format(unit) return float(num) * \ (1024 ** (SIZE_UNIT.index(unit) - 1)) def _get_shared_memory_size_in_M(): try: df_infos = os.popen(SHM_QUERY_CMD).readlines() except: return None else: shm_infos = [] for df_info in df_infos: info = df_info.strip() if info.find(SHM_KEY) >= 0: shm_infos.append(info.split()) if len(shm_infos) == 0: return None elif len(shm_infos) == 1: return _parse_size_in_M(shm_infos[0][3]) else: default_mount_infos = [ si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT ] if default_mount_infos: return _parse_size_in_M(default_mount_infos[0][3]) else: return max([_parse_size_in_M(si[3]) for si in shm_infos]) ================================================ FILE: ppdet/data/source/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import coco from . import voc from . import widerface from . import category from . import keypoint_coco from . import mot from . import sniper_coco from . import culane from . import lvis from .coco import * from .voc import * from .widerface import * from .category import * from .keypoint_coco import * from .mot import * from .sniper_coco import SniperCOCODataSet from .dataset import ImageFolder from .pose3d_cmb import * from .culane import * from .lvis import * ================================================ FILE: ppdet/data/source/category.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os from ppdet.data.source.voc import pascalvoc_label from ppdet.data.source.widerface import widerface_label from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['get_categories'] def get_categories(metric_type, anno_file=None, arch=None): """ Get class id to category id map and category id to category name map from annotation file. Args: metric_type (str): metric type, currently support 'coco', 'voc', 'oid' and 'widerface'. 
anno_file (str): annotation file path """ if arch == 'keypoint_arch': return (None, {'id': 'keypoint'}) if anno_file == None or (not os.path.isfile(anno_file)): logger.warning( "anno_file '{}' is None or not set or not exist, " "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, " "otherwise the default categories will be used by metric_type.". format(anno_file)) if metric_type.lower() == 'coco' or metric_type.lower( ) == 'rbox' or metric_type.lower() == 'snipercoco': if anno_file and os.path.isfile(anno_file): if anno_file.endswith('json'): # lazy import pycocotools here from pycocotools.coco import COCO coco = COCO(anno_file) cats = coco.loadCats(coco.getCatIds()) clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} catid2name = {cat['id']: cat['name'] for cat in cats} elif anno_file.endswith('txt'): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} else: raise ValueError("anno_file {} should be json or txt.".format( anno_file)) return clsid2catid, catid2name # anno file not exist, load default categories of COCO17 else: if metric_type.lower() == 'rbox': logger.warning( "metric_type: {}, load default categories of DOTA.".format( metric_type)) return _dota_category() logger.warning("metric_type: {}, load default categories of COCO.". format(metric_type)) return _coco17_category() elif metric_type.lower() == 'voc': if anno_file and os.path.isfile(anno_file): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name # anno file not exist, load default categories of # VOC all 20 categories else: logger.warning("metric_type: {}, load default categories of VOC.". format(metric_type)) return _vocall_category() elif metric_type.lower() == 'oid': if anno_file and os.path.isfile(anno_file): logger.warning("only default categories support for OID19") return _oid19_category() elif metric_type.lower() == 'widerface': return _widerface_category() elif metric_type.lower() in [ 'keypointtopdowncocoeval', 'keypointtopdownmpiieval', 'keypointtopdowncocowholebadyhandeval' ]: return (None, {'id': 'keypoint'}) elif metric_type.lower() == 'pose3deval': return (None, {'id': 'pose3d'}) elif metric_type.lower() in ['mot', 'motdet', 'reid']: if anno_file and os.path.isfile(anno_file): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name # anno file not exist, load default category 'pedestrian'. else: logger.warning( "metric_type: {}, load default categories of pedestrian MOT.". 
format(metric_type)) return _mot_category(category='pedestrian') elif metric_type.lower() in ['kitti', 'bdd100kmot']: return _mot_category(category='vehicle') elif metric_type.lower() in ['mcmot']: if anno_file and os.path.isfile(anno_file): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name # anno file not exist, load default categories of visdrone all 10 categories else: logger.warning( "metric_type: {}, load default categories of VisDrone.".format( metric_type)) return _visdrone_category() else: raise ValueError("unknown metric type {}".format(metric_type)) def _mot_category(category='pedestrian'): """ Get class id to category id map and category id to category name map of mot dataset """ label_map = {category: 0} label_map = sorted(label_map.items(), key=lambda x: x[1]) cats = [l[0] for l in label_map] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name def _coco17_category(): """ Get class id to category id map and category id to category name map of COCO2017 dataset """ clsid2catid = { 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 27, 26: 28, 27: 31, 28: 32, 29: 33, 30: 34, 31: 35, 32: 36, 33: 37, 34: 38, 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 46, 42: 47, 43: 48, 44: 49, 45: 50, 46: 51, 47: 52, 48: 53, 49: 54, 50: 55, 51: 56, 52: 57, 53: 58, 54: 59, 55: 60, 56: 61, 57: 62, 58: 63, 59: 64, 60: 65, 61: 67, 62: 70, 63: 72, 64: 73, 65: 74, 66: 75, 67: 76, 68: 77, 69: 78, 70: 79, 71: 80, 72: 81, 73: 82, 74: 84, 75: 85, 76: 86, 77: 87, 78: 88, 79: 89, 80: 90 } catid2name = { 0: 'background', 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush' } clsid2catid = {k - 1: v for k, v in clsid2catid.items()} catid2name.pop(0) return clsid2catid, catid2name def _dota_category(): """ Get class id to category id map and category id to category name map of dota dataset """ catid2name = { 0: 'background', 1: 'plane', 2: 'baseball-diamond', 3: 'bridge', 4: 'ground-track-field', 5: 'small-vehicle', 6: 'large-vehicle', 7: 'ship', 8: 'tennis-court', 9: 'basketball-court', 10: 
'storage-tank', 11: 'soccer-ball-field', 12: 'roundabout', 13: 'harbor', 14: 'swimming-pool', 15: 'helicopter' } catid2name.pop(0) clsid2catid = {i: i + 1 for i in range(len(catid2name))} return clsid2catid, catid2name def _vocall_category(): """ Get class id to category id map and category id to category name map of mixup voc dataset """ label_map = pascalvoc_label() label_map = sorted(label_map.items(), key=lambda x: x[1]) cats = [l[0] for l in label_map] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name def _widerface_category(): label_map = widerface_label() label_map = sorted(label_map.items(), key=lambda x: x[1]) cats = [l[0] for l in label_map] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name def _oid19_category(): clsid2catid = {k: k + 1 for k in range(500)} catid2name = { 0: "background", 1: "Infant bed", 2: "Rose", 3: "Flag", 4: "Flashlight", 5: "Sea turtle", 6: "Camera", 7: "Animal", 8: "Glove", 9: "Crocodile", 10: "Cattle", 11: "House", 12: "Guacamole", 13: "Penguin", 14: "Vehicle registration plate", 15: "Bench", 16: "Ladybug", 17: "Human nose", 18: "Watermelon", 19: "Flute", 20: "Butterfly", 21: "Washing machine", 22: "Raccoon", 23: "Segway", 24: "Taco", 25: "Jellyfish", 26: "Cake", 27: "Pen", 28: "Cannon", 29: "Bread", 30: "Tree", 31: "Shellfish", 32: "Bed", 33: "Hamster", 34: "Hat", 35: "Toaster", 36: "Sombrero", 37: "Tiara", 38: "Bowl", 39: "Dragonfly", 40: "Moths and butterflies", 41: "Antelope", 42: "Vegetable", 43: "Torch", 44: "Building", 45: "Power plugs and sockets", 46: "Blender", 47: "Billiard table", 48: "Cutting board", 49: "Bronze sculpture", 50: "Turtle", 51: "Broccoli", 52: "Tiger", 53: "Mirror", 54: "Bear", 55: "Zucchini", 56: "Dress", 57: "Volleyball", 58: "Guitar", 59: "Reptile", 60: "Golf cart", 61: "Tart", 62: "Fedora", 63: "Carnivore", 64: "Car", 65: "Lighthouse", 66: "Coffeemaker", 67: "Food processor", 68: "Truck", 69: "Bookcase", 70: "Surfboard", 71: "Footwear", 72: "Bench", 73: "Necklace", 74: "Flower", 75: "Radish", 76: "Marine mammal", 77: "Frying pan", 78: "Tap", 79: "Peach", 80: "Knife", 81: "Handbag", 82: "Laptop", 83: "Tent", 84: "Ambulance", 85: "Christmas tree", 86: "Eagle", 87: "Limousine", 88: "Kitchen & dining room table", 89: "Polar bear", 90: "Tower", 91: "Football", 92: "Willow", 93: "Human head", 94: "Stop sign", 95: "Banana", 96: "Mixer", 97: "Binoculars", 98: "Dessert", 99: "Bee", 100: "Chair", 101: "Wood-burning stove", 102: "Flowerpot", 103: "Beaker", 104: "Oyster", 105: "Woodpecker", 106: "Harp", 107: "Bathtub", 108: "Wall clock", 109: "Sports uniform", 110: "Rhinoceros", 111: "Beehive", 112: "Cupboard", 113: "Chicken", 114: "Man", 115: "Blue jay", 116: "Cucumber", 117: "Balloon", 118: "Kite", 119: "Fireplace", 120: "Lantern", 121: "Missile", 122: "Book", 123: "Spoon", 124: "Grapefruit", 125: "Squirrel", 126: "Orange", 127: "Coat", 128: "Punching bag", 129: "Zebra", 130: "Billboard", 131: "Bicycle", 132: "Door handle", 133: "Mechanical fan", 134: "Ring binder", 135: "Table", 136: "Parrot", 137: "Sock", 138: "Vase", 139: "Weapon", 140: "Shotgun", 141: "Glasses", 142: "Seahorse", 143: "Belt", 144: "Watercraft", 145: "Window", 146: "Giraffe", 147: "Lion", 148: "Tire", 149: "Vehicle", 150: "Canoe", 151: "Tie", 152: "Shelf", 153: "Picture frame", 154: "Printer", 155: "Human leg", 156: "Boat", 157: "Slow cooker", 158: "Croissant", 159: "Candle", 160: "Pancake", 161: 
"Pillow", 162: "Coin", 163: "Stretcher", 164: "Sandal", 165: "Woman", 166: "Stairs", 167: "Harpsichord", 168: "Stool", 169: "Bus", 170: "Suitcase", 171: "Human mouth", 172: "Juice", 173: "Skull", 174: "Door", 175: "Violin", 176: "Chopsticks", 177: "Digital clock", 178: "Sunflower", 179: "Leopard", 180: "Bell pepper", 181: "Harbor seal", 182: "Snake", 183: "Sewing machine", 184: "Goose", 185: "Helicopter", 186: "Seat belt", 187: "Coffee cup", 188: "Microwave oven", 189: "Hot dog", 190: "Countertop", 191: "Serving tray", 192: "Dog bed", 193: "Beer", 194: "Sunglasses", 195: "Golf ball", 196: "Waffle", 197: "Palm tree", 198: "Trumpet", 199: "Ruler", 200: "Helmet", 201: "Ladder", 202: "Office building", 203: "Tablet computer", 204: "Toilet paper", 205: "Pomegranate", 206: "Skirt", 207: "Gas stove", 208: "Cookie", 209: "Cart", 210: "Raven", 211: "Egg", 212: "Burrito", 213: "Goat", 214: "Kitchen knife", 215: "Skateboard", 216: "Salt and pepper shakers", 217: "Lynx", 218: "Boot", 219: "Platter", 220: "Ski", 221: "Swimwear", 222: "Swimming pool", 223: "Drinking straw", 224: "Wrench", 225: "Drum", 226: "Ant", 227: "Human ear", 228: "Headphones", 229: "Fountain", 230: "Bird", 231: "Jeans", 232: "Television", 233: "Crab", 234: "Microphone", 235: "Home appliance", 236: "Snowplow", 237: "Beetle", 238: "Artichoke", 239: "Jet ski", 240: "Stationary bicycle", 241: "Human hair", 242: "Brown bear", 243: "Starfish", 244: "Fork", 245: "Lobster", 246: "Corded phone", 247: "Drink", 248: "Saucer", 249: "Carrot", 250: "Insect", 251: "Clock", 252: "Castle", 253: "Tennis racket", 254: "Ceiling fan", 255: "Asparagus", 256: "Jaguar", 257: "Musical instrument", 258: "Train", 259: "Cat", 260: "Rifle", 261: "Dumbbell", 262: "Mobile phone", 263: "Taxi", 264: "Shower", 265: "Pitcher", 266: "Lemon", 267: "Invertebrate", 268: "Turkey", 269: "High heels", 270: "Bust", 271: "Elephant", 272: "Scarf", 273: "Barrel", 274: "Trombone", 275: "Pumpkin", 276: "Box", 277: "Tomato", 278: "Frog", 279: "Bidet", 280: "Human face", 281: "Houseplant", 282: "Van", 283: "Shark", 284: "Ice cream", 285: "Swim cap", 286: "Falcon", 287: "Ostrich", 288: "Handgun", 289: "Whiteboard", 290: "Lizard", 291: "Pasta", 292: "Snowmobile", 293: "Light bulb", 294: "Window blind", 295: "Muffin", 296: "Pretzel", 297: "Computer monitor", 298: "Horn", 299: "Furniture", 300: "Sandwich", 301: "Fox", 302: "Convenience store", 303: "Fish", 304: "Fruit", 305: "Earrings", 306: "Curtain", 307: "Grape", 308: "Sofa bed", 309: "Horse", 310: "Luggage and bags", 311: "Desk", 312: "Crutch", 313: "Bicycle helmet", 314: "Tick", 315: "Airplane", 316: "Canary", 317: "Spatula", 318: "Watch", 319: "Lily", 320: "Kitchen appliance", 321: "Filing cabinet", 322: "Aircraft", 323: "Cake stand", 324: "Candy", 325: "Sink", 326: "Mouse", 327: "Wine", 328: "Wheelchair", 329: "Goldfish", 330: "Refrigerator", 331: "French fries", 332: "Drawer", 333: "Treadmill", 334: "Picnic basket", 335: "Dice", 336: "Cabbage", 337: "Football helmet", 338: "Pig", 339: "Person", 340: "Shorts", 341: "Gondola", 342: "Honeycomb", 343: "Doughnut", 344: "Chest of drawers", 345: "Land vehicle", 346: "Bat", 347: "Monkey", 348: "Dagger", 349: "Tableware", 350: "Human foot", 351: "Mug", 352: "Alarm clock", 353: "Pressure cooker", 354: "Human hand", 355: "Tortoise", 356: "Baseball glove", 357: "Sword", 358: "Pear", 359: "Miniskirt", 360: "Traffic sign", 361: "Girl", 362: "Roller skates", 363: "Dinosaur", 364: "Porch", 365: "Human beard", 366: "Submarine sandwich", 367: "Screwdriver", 368: "Strawberry", 369: "Wine 
glass", 370: "Seafood", 371: "Racket", 372: "Wheel", 373: "Sea lion", 374: "Toy", 375: "Tea", 376: "Tennis ball", 377: "Waste container", 378: "Mule", 379: "Cricket ball", 380: "Pineapple", 381: "Coconut", 382: "Doll", 383: "Coffee table", 384: "Snowman", 385: "Lavender", 386: "Shrimp", 387: "Maple", 388: "Cowboy hat", 389: "Goggles", 390: "Rugby ball", 391: "Caterpillar", 392: "Poster", 393: "Rocket", 394: "Organ", 395: "Saxophone", 396: "Traffic light", 397: "Cocktail", 398: "Plastic bag", 399: "Squash", 400: "Mushroom", 401: "Hamburger", 402: "Light switch", 403: "Parachute", 404: "Teddy bear", 405: "Winter melon", 406: "Deer", 407: "Musical keyboard", 408: "Plumbing fixture", 409: "Scoreboard", 410: "Baseball bat", 411: "Envelope", 412: "Adhesive tape", 413: "Briefcase", 414: "Paddle", 415: "Bow and arrow", 416: "Telephone", 417: "Sheep", 418: "Jacket", 419: "Boy", 420: "Pizza", 421: "Otter", 422: "Office supplies", 423: "Couch", 424: "Cello", 425: "Bull", 426: "Camel", 427: "Ball", 428: "Duck", 429: "Whale", 430: "Shirt", 431: "Tank", 432: "Motorcycle", 433: "Accordion", 434: "Owl", 435: "Porcupine", 436: "Sun hat", 437: "Nail", 438: "Scissors", 439: "Swan", 440: "Lamp", 441: "Crown", 442: "Piano", 443: "Sculpture", 444: "Cheetah", 445: "Oboe", 446: "Tin can", 447: "Mango", 448: "Tripod", 449: "Oven", 450: "Mouse", 451: "Barge", 452: "Coffee", 453: "Snowboard", 454: "Common fig", 455: "Salad", 456: "Marine invertebrates", 457: "Umbrella", 458: "Kangaroo", 459: "Human arm", 460: "Measuring cup", 461: "Snail", 462: "Loveseat", 463: "Suit", 464: "Teapot", 465: "Bottle", 466: "Alpaca", 467: "Kettle", 468: "Trousers", 469: "Popcorn", 470: "Centipede", 471: "Spider", 472: "Sparrow", 473: "Plate", 474: "Bagel", 475: "Personal care", 476: "Apple", 477: "Brassiere", 478: "Bathroom cabinet", 479: "studio couch", 480: "Computer keyboard", 481: "Table tennis racket", 482: "Sushi", 483: "Cabinetry", 484: "Street light", 485: "Towel", 486: "Nightstand", 487: "Rabbit", 488: "Dolphin", 489: "Dog", 490: "Jug", 491: "Wok", 492: "Fire hydrant", 493: "Human eye", 494: "Skyscraper", 495: "Backpack", 496: "Potato", 497: "Paper towel", 498: "Lifejacket", 499: "Bicycle wheel", 500: "Toilet", } return clsid2catid, catid2name def _visdrone_category(): clsid2catid = {i: i for i in range(10)} catid2name = { 0: 'pedestrian', 1: 'people', 2: 'bicycle', 3: 'car', 4: 'van', 5: 'truck', 6: 'tricycle', 7: 'awning-tricycle', 8: 'bus', 9: 'motor' } return clsid2catid, catid2name ================================================ FILE: ppdet/data/source/coco.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import copy try: from collections.abc import Sequence except Exception: from collections import Sequence import numpy as np from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset', 'COCOInstSegDataset' ] @register @serializable class COCODataSet(DetDataset): """ Load dataset with COCO format. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): coco annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. load_crowd (bool): whether to load crowded ground-truth. False as default allow_empty (bool): whether to load empty entry. False as default empty_ratio (float): the ratio of empty record number to total record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default repeat (int): repeat times for dataset, use in benchmark. """ def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1): super(COCODataSet, self).__init__( dataset_dir, image_dir, anno_path, data_fields, sample_num, repeat=repeat) self.load_image_only = False self.load_semantic = False self.load_crowd = load_crowd self.allow_empty = allow_empty self.empty_ratio = empty_ratio def _sample_empty(self, records, num): # if empty_ratio is out of [0. ,1.), do not sample the records if self.empty_ratio < 0. or self.empty_ratio >= 1.: return records import random sample_num = min( int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) records = random.sample(records, sample_num) return records def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from pycocotools.coco import COCO coco = COCO(anno_path) img_ids = coco.getImgIds() img_ids.sort() cat_ids = coco.getCatIds() records = [] empty_records = [] ct = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ coco.loadCats(catid)[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in coco.dataset: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) for img_id in img_ids: img_anno = coco.loadImgs([img_id])[0] im_fname = img_anno['file_name'] im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue coco_rec = { 'im_file': im_path, 'im_id': np.array([img_id]), 'h': im_h, 'w': im_w, } if 'image' in self.data_fields else {} if not self.load_image_only: ins_anno_ids = coco.getAnnIds( imgIds=[img_id], iscrowd=None if self.load_crowd else False) instances = coco.loadAnns(ins_anno_ids) bboxes = [] is_rbox_anno = False for inst in instances: # check gt bbox 
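# Annotation (descriptive comments, not in the original source): each raw
# COCO instance is sanitized before use -- instances flagged 'ignore',
# missing a 'bbox', or with an all-zero box are skipped; surviving
# [x, y, w, h] boxes are converted to [x1, y1, x2, y2] and kept only if
# area > 0 and both sides exceed eps = 1e-5, stored rounded to three
# decimals under 'clean_bbox'; anything else is logged as invalid.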
if inst.get('ignore', False): continue if 'bbox' not in inst.keys(): continue else: if not any(np.array(inst['bbox'])): continue x1, y1, box_w, box_h = inst['bbox'] x2 = x1 + box_w y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] bboxes.append(inst) else: logger.warning( 'Found an invalid bbox in annotations: im_id: {}, ' 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( img_id, float(inst['area']), x1, y1, x2, y2)) num_bbox = len(bboxes) if num_bbox <= 0 and not self.allow_empty: continue elif num_bbox <= 0: is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) has_segmentation = False has_track_id = False for i, box in enumerate(bboxes): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] is_crowd[i][0] = box['iscrowd'] # check RLE format if 'segmentation' in box and box['iscrowd'] == 1: gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] elif 'segmentation' in box and box['segmentation']: if not np.array( box['segmentation'], dtype=object).size > 0 and not self.allow_empty: bboxes.pop(i) gt_poly.pop(i) np.delete(is_crowd, i) np.delete(gt_class, i) np.delete(gt_bbox, i) else: gt_poly[i] = box['segmentation'] has_segmentation = True if 'track_id' in box: gt_track_id[i][0] = box['track_id'] has_track_id = True if has_segmentation and not any( gt_poly) and not self.allow_empty: continue gt_rec = { 'is_crowd': is_crowd, 'gt_class': gt_class, 'gt_bbox': gt_bbox, 'gt_poly': gt_poly, } if has_track_id: gt_rec.update({'gt_track_id': gt_track_id}) for k, v in gt_rec.items(): if k in self.data_fields: coco_rec[k] = v # TODO: remove load_semantic if self.load_semantic and 'semantic' in self.data_fields: seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', 'train2017', im_fname[:-3] + 'png') coco_rec.update({'semantic': seg_path}) logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( im_path, img_id, im_h, im_w)) if is_empty: empty_records.append(coco_rec) else: records.append(coco_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
format(ct, len(img_ids) - ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records @register @serializable class SlicedCOCODataSet(COCODataSet): """Sliced COCODataSet""" def __init__( self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1, sliced_size=[640, 640], overlap_ratio=[0.25, 0.25], ): super(SlicedCOCODataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, load_crowd=load_crowd, allow_empty=allow_empty, empty_ratio=empty_ratio, repeat=repeat, ) self.sliced_size = sliced_size self.overlap_ratio = overlap_ratio def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from pycocotools.coco import COCO coco = COCO(anno_path) img_ids = coco.getImgIds() img_ids.sort() cat_ids = coco.getCatIds() records = [] empty_records = [] ct = 0 ct_sub = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ coco.loadCats(catid)[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in coco.dataset: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) try: import sahi from sahi.slicing import slice_image except Exception as e: logger.error( 'sahi not found, plaese install sahi. ' 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
) raise e sub_img_ids = 0 for img_id in img_ids: img_anno = coco.loadImgs([img_id])[0] im_fname = img_anno['file_name'] im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue slice_image_result = sahi.slicing.slice_image( image=im_path, slice_height=self.sliced_size[0], slice_width=self.sliced_size[1], overlap_height_ratio=self.overlap_ratio[0], overlap_width_ratio=self.overlap_ratio[1]) sub_img_num = len(slice_image_result) for _ind in range(sub_img_num): im = slice_image_result.images[_ind] coco_rec = { 'image': im, 'im_id': np.array([sub_img_ids + _ind]), 'h': im.shape[0], 'w': im.shape[1], 'ori_im_id': np.array([img_id]), 'st_pix': np.array( slice_image_result.starting_pixels[_ind], dtype=np.float32), 'is_last': 1 if _ind == sub_img_num - 1 else 0, } if 'image' in self.data_fields else {} records.append(coco_rec) ct_sub += sub_img_num ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('{} samples and slice to {} sub_samples in file {}'.format( ct, ct_sub, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records @register @serializable class SemiCOCODataSet(COCODataSet): """Semi-COCODataSet used for supervised and unsupervised dataSet""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1, supervised=True): super(SemiCOCODataSet, self).__init__( dataset_dir, image_dir, anno_path, data_fields, sample_num, load_crowd, allow_empty, empty_ratio, repeat) self.supervised = supervised self.length = -1 # defalut -1 means all def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from pycocotools.coco import COCO coco = COCO(anno_path) img_ids = coco.getImgIds() img_ids.sort() cat_ids = coco.getCatIds() records = [] empty_records = [] ct = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ coco.loadCats(catid)[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in coco.dataset or self.supervised == False: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) for img_id in img_ids: img_anno = coco.loadImgs([img_id])[0] im_fname = img_anno['file_name'] im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue coco_rec = { 'im_file': im_path, 'im_id': np.array([img_id]), 'h': im_h, 'w': im_w, } 
if 'image' in self.data_fields else {} if not self.load_image_only: ins_anno_ids = coco.getAnnIds( imgIds=[img_id], iscrowd=None if self.load_crowd else False) instances = coco.loadAnns(ins_anno_ids) bboxes = [] is_rbox_anno = False for inst in instances: # check gt bbox if inst.get('ignore', False): continue if 'bbox' not in inst.keys(): continue else: if not any(np.array(inst['bbox'])): continue x1, y1, box_w, box_h = inst['bbox'] x2 = x1 + box_w y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] bboxes.append(inst) else: logger.warning( 'Found an invalid bbox in annotations: im_id: {}, ' 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( img_id, float(inst['area']), x1, y1, x2, y2)) num_bbox = len(bboxes) if num_bbox <= 0 and not self.allow_empty: continue elif num_bbox <= 0: is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox has_segmentation = False for i, box in enumerate(bboxes): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] is_crowd[i][0] = box['iscrowd'] # check RLE format if 'segmentation' in box and box['iscrowd'] == 1: gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] elif 'segmentation' in box and box['segmentation']: if not np.array(box['segmentation'] ).size > 0 and not self.allow_empty: bboxes.pop(i) gt_poly.pop(i) np.delete(is_crowd, i) np.delete(gt_class, i) np.delete(gt_bbox, i) else: gt_poly[i] = box['segmentation'] has_segmentation = True if has_segmentation and not any( gt_poly) and not self.allow_empty: continue gt_rec = { 'is_crowd': is_crowd, 'gt_class': gt_class, 'gt_bbox': gt_bbox, 'gt_poly': gt_poly, } for k, v in gt_rec.items(): if k in self.data_fields: coco_rec[k] = v # TODO: remove load_semantic if self.load_semantic and 'semantic' in self.data_fields: seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', 'train2017', im_fname[:-3] + 'png') coco_rec.update({'semantic': seg_path}) logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( im_path, img_id, im_h, im_w)) if is_empty: empty_records.append(coco_rec) else: records.append(coco_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
format(ct, len(img_ids) - ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records if self.supervised: logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED') else: if self.length > 0: # unsup length will be decide by sup length all_roidbs = self.roidbs.copy() selected_idxs = [ np.random.choice(len(all_roidbs)) for _ in range(self.length) ] self.roidbs = [all_roidbs[i] for i in selected_idxs] logger.info( f'Use {len(self.roidbs)} unsup_samples data as UNLABELED') def __getitem__(self, idx): n = len(self.roidbs) if self.repeat > 1: idx %= n # data batch roidb = copy.deepcopy(self.roidbs[idx]) if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: roidb = [roidb, ] + [ copy.deepcopy(self.roidbs[np.random.randint(n)]) for _ in range(4) ] if isinstance(roidb, Sequence): for r in roidb: r['curr_iter'] = self._curr_iter else: roidb['curr_iter'] = self._curr_iter self._curr_iter += 1 return self.transform(roidb) # for PaddleX @register @serializable class COCODetDataset(COCODataSet): pass # for PaddleX @register @serializable class COCOInstSegDataset(COCODataSet): pass ================================================ FILE: ppdet/data/source/culane.py ================================================ from ppdet.core.workspace import register, serializable import cv2 import os import tarfile import numpy as np import os.path as osp from ppdet.data.source.dataset import DetDataset from imgaug.augmentables.lines import LineStringsOnImage from imgaug.augmentables.segmaps import SegmentationMapsOnImage from ppdet.data.culane_utils import lane_to_linestrings import pickle as pkl from ppdet.utils.logger import setup_logger try: from collections.abc import Sequence except Exception: from collections import Sequence from .dataset import DetDataset, _make_dataset, _is_valid_file from ppdet.utils.download import download_dataset logger = setup_logger(__name__) @register @serializable class CULaneDataSet(DetDataset): def __init__( self, dataset_dir, cut_height, list_path, split='train', data_fields=['image'], video_file=None, frame_rate=-1, ): super(CULaneDataSet, self).__init__( dataset_dir=dataset_dir, cut_height=cut_height, split=split, data_fields=data_fields) self.dataset_dir = dataset_dir self.list_path = osp.join(dataset_dir, list_path) self.cut_height = cut_height self.data_fields = data_fields self.split = split self.training = 'train' in split self.data_infos = [] self.video_file = video_file self.frame_rate = frame_rate self._imid2path = {} self.predict_dir = None def __len__(self): return len(self.data_infos) def check_or_download_dataset(self): if not osp.exists(self.dataset_dir): download_dataset("dataset", dataset="culane") # extract .tar files in self.dataset_dir for fname in os.listdir(self.dataset_dir): logger.info("Decompressing {}...".format(fname)) # ignore .* files if fname.startswith('.'): continue if fname.find('.tar.gz') >= 0: with tarfile.open(osp.join(self.dataset_dir, fname)) as tf: tf.extractall(path=self.dataset_dir) logger.info("Dataset files are ready.") def parse_dataset(self): logger.info('Loading CULane annotations...') if self.predict_dir is not None: 
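# Annotation (not in the original source): in predict mode the list file is
# never read; data_infos is filled later through set_images(). Otherwise the
# parsed annotations are cached to cache/culane_paddle_<split>.pkl below, so
# later runs skip the slow per-image *.lines.txt parsing.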
logger.info('switch to predict mode') return # Waiting for the dataset to load is tedious, let's cache it os.makedirs('cache', exist_ok=True) cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split) if os.path.exists(cache_path): with open(cache_path, 'rb') as cache_file: self.data_infos = pkl.load(cache_file) self.max_lanes = max( len(anno['lanes']) for anno in self.data_infos) return with open(self.list_path) as list_file: for line in list_file: infos = self.load_annotation(line.split()) self.data_infos.append(infos) # cache data infos to file with open(cache_path, 'wb') as cache_file: pkl.dump(self.data_infos, cache_file) def load_annotation(self, line): infos = {} img_line = line[0] img_line = img_line[1 if img_line[0] == '/' else 0::] img_path = os.path.join(self.dataset_dir, img_line) infos['img_name'] = img_line infos['img_path'] = img_path if len(line) > 1: mask_line = line[1] mask_line = mask_line[1 if mask_line[0] == '/' else 0::] mask_path = os.path.join(self.dataset_dir, mask_line) infos['mask_path'] = mask_path if len(line) > 2: exist_list = [int(l) for l in line[2:]] infos['lane_exist'] = np.array(exist_list) anno_path = img_path[: -3] + 'lines.txt' # strip the 'jpg' suffix and append 'lines.txt' with open(anno_path, 'r') as anno_file: data = [ list(map(float, line.split())) for line in anno_file.readlines() ] lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2) if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data] lanes = [list(set(lane)) for lane in lanes] # remove duplicated points lanes = [lane for lane in lanes if len(lane) > 2] # drop lanes with fewer than 3 points lanes = [sorted( lane, key=lambda x: x[1]) for lane in lanes] # sort by y infos['lanes'] = lanes return infos def set_images(self, images): self.predict_dir = images self.data_infos = self._load_images() def _find_images(self): predict_dir = self.predict_dir if not isinstance(predict_dir, Sequence): predict_dir = [predict_dir] images = [] for im_dir in predict_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.predict_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def _load_images(self): images = self._find_images() ct = 0 records = [] for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break rec = { 'im_id': np.array([ct]), "img_path": os.path.abspath(image), "img_name": os.path.basename(image), "lanes": [] } self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_imid2path(self): return self._imid2path def __getitem__(self, idx): data_info = self.data_infos[idx] img = cv2.imread(data_info['img_path']) img = img[self.cut_height:, :, :] sample = data_info.copy() sample.update({'image': img}) img_org = sample['image'] if self.training: label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED) if len(label.shape) > 2: label = label[:, :, 0] label = label.squeeze() label = label[self.cut_height:, :] sample.update({'mask': label}) if self.cut_height != 0: new_lanes = [] for i in sample['lanes']: lanes = [] for p in i: lanes.append((p[0], p[1] - self.cut_height)) new_lanes.append(lanes) sample.update({'lanes': new_lanes}) sample['mask'] = SegmentationMapsOnImage( sample['mask'], shape=img_org.shape) sample['full_img_path'] = data_info['img_path'] sample['img_name'] = data_info['img_name'] sample['im_id'] = np.array([idx])
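# Annotation (not in the original source): everything above is already in the
# cropped coordinate frame -- the top cut_height rows were removed from image
# and mask, and each lane point's y was reduced by cut_height. Below, lanes
# are wrapped as imgaug LineStringsOnImage so geometric augmentations
# transform image, mask and lane geometry consistently.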
sample['image'] = sample['image'].copy().astype(np.uint8) sample['lanes'] = lane_to_linestrings(sample['lanes']) sample['lanes'] = LineStringsOnImage( sample['lanes'], shape=img_org.shape) sample['seg'] = np.zeros(img_org.shape) return sample ================================================ FILE: ppdet/data/source/dataset.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import copy import numpy as np try: from collections.abc import Sequence except Exception: from collections import Sequence from pycocotools.coco import COCO from paddle.io import Dataset from ppdet.core.workspace import register, serializable from ppdet.utils.download import get_dataset_path from ppdet.data import source from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @serializable class DetDataset(Dataset): """ Load detection dataset. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. use_default_label (bool): whether to load default label list. repeat (int): repeat times for dataset, use in benchmark. 
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, use_default_label=None, repeat=1, **kwargs): super(DetDataset, self).__init__() self.dataset_dir = dataset_dir if dataset_dir is not None else '' self.anno_path = anno_path self.image_dir = image_dir if image_dir is not None else '' self.data_fields = data_fields self.sample_num = sample_num self.use_default_label = use_default_label self.repeat = repeat self._epoch = 0 self._curr_iter = 0 def __len__(self, ): return len(self.roidbs) * self.repeat def __call__(self, *args, **kwargs): return self def __getitem__(self, idx): n = len(self.roidbs) if self.repeat > 1: idx %= n # data batch roidb = copy.deepcopy(self.roidbs[idx]) if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: roidb = [roidb, ] + [ copy.deepcopy(self.roidbs[np.random.randint(n)]) for _ in range(4) ] elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch: # Add previous image as input, only used in CenterTrack idx_pre_img = idx - 1 if idx_pre_img < 0: idx_pre_img = idx + 1 roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])] if isinstance(roidb, Sequence): for r in roidb: r['curr_iter'] = self._curr_iter r['curr_epoch'] = self._epoch else: roidb['curr_iter'] = self._curr_iter roidb['curr_epoch'] = self._epoch self._curr_iter += 1 if self.transform_schedulers: assert isinstance(self.transform_schedulers, list) if isinstance(roidb, Sequence): for r in roidb: r['transform_schedulers'] = self.transform_schedulers else: roidb['transform_schedulers'] = self.transform_schedulers return self.transform(roidb) def check_or_download_dataset(self): self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path, self.image_dir) def set_kwargs(self, **kwargs): self.mixup_epoch = kwargs.get('mixup_epoch', -1) self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) self.pre_img_epoch = kwargs.get('pre_img_epoch', -1) self.transform_schedulers = kwargs.get('transform_schedulers', None) def set_transform(self, transform): self.transform = transform def set_epoch(self, epoch_id): self._epoch = epoch_id def parse_dataset(self, ): raise NotImplementedError( "Need to implement parse_dataset method of Dataset") def get_anno(self): if self.anno_path is None: return return os.path.join(self.dataset_dir, self.anno_path) def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')): return f.lower().endswith(extensions) def _make_dataset(dir): dir = os.path.expanduser(dir) if not os.path.isdir(dir): raise ('{} should be a dir'.format(dir)) images = [] for root, _, fnames in sorted(os.walk(dir, followlinks=True)): for fname in sorted(fnames): path = os.path.join(root, fname) if _is_valid_file(path): images.append(path) return images @register @serializable class ImageFolder(DetDataset): def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, sample_num=-1, use_default_label=None, **kwargs): super(ImageFolder, self).__init__( dataset_dir, image_dir, anno_path, sample_num=sample_num, use_default_label=use_default_label) self._imid2path = {} self.roidbs = None self.sample_num = sample_num def check_or_download_dataset(self): return def get_anno(self): if self.anno_path is 
None: return if self.dataset_dir: return os.path.join(self.dataset_dir, self.anno_path) else: return self.anno_path def parse_dataset(self, ): if not self.roidbs: self.roidbs = self._load_images() def _parse(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] images = [] for im_dir in image_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.dataset_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def get_images(self): images_path = [] coco = COCO(os.path.join(self.dataset_dir, self.anno_path)) imgIds = coco.getImgIds(catIds=[]) for imgId in imgIds: filename = coco.loadImgs(imgId)[0]["file_name"] images_path.append(os.path.join(self.dataset_dir, self.image_dir, filename)) return images_path def _load_images(self, do_eval=False): images = self._parse() ct = 0 records = [] anno_file = self.get_anno() coco = COCO(anno_file) for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break if do_eval: image_id = self.get_image_id(image, coco) ct = image_id rec = {'im_id': np.array([ct]), 'im_file': image} self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_image_id(self, image, coco): image_ids = coco.getImgIds() for image_id in image_ids: img_info = coco.loadImgs(image_id)[0] if img_info['file_name'] in image: return image_id else: continue def get_imid2path(self): return self._imid2path def set_images(self, images, do_eval=False): self.image_dir = images self.roidbs = self._load_images(do_eval=do_eval) def set_slice_images(self, images, slice_size=[640, 640], overlap_ratio=[0.25, 0.25]): self.image_dir = images ori_records = self._load_images() try: import sahi from sahi.slicing import slice_image except Exception as e: logger.error( 'sahi not found, plaese install sahi. ' 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
) raise e sub_img_ids = 0 ct = 0 ct_sub = 0 records = [] for i, ori_rec in enumerate(ori_records): im_path = ori_rec['im_file'] slice_image_result = sahi.slicing.slice_image( image=im_path, slice_height=slice_size[0], slice_width=slice_size[1], overlap_height_ratio=overlap_ratio[0], overlap_width_ratio=overlap_ratio[1]) sub_img_num = len(slice_image_result) for _ind in range(sub_img_num): im = slice_image_result.images[_ind] rec = { 'image': im, 'im_id': np.array([sub_img_ids + _ind]), 'h': im.shape[0], 'w': im.shape[1], 'ori_im_id': np.array([ori_rec['im_id'][0]]), 'st_pix': np.array( slice_image_result.starting_pixels[_ind], dtype=np.float32), 'is_last': 1 if _ind == sub_img_num - 1 else 0, } if 'image' in self.data_fields else {} records.append(rec) ct_sub += sub_img_num ct += 1 logger.info('{} samples and slice to {} sub_samples.'.format(ct, ct_sub)) self.roidbs = records def get_label_list(self): # Only VOC dataset needs label list in ImageFold return self.anno_path @register class CommonDataset(object): def __init__(self, **dataset_args): super(CommonDataset, self).__init__() dataset_args = copy.deepcopy(dataset_args) type = dataset_args.pop("name") self.dataset = getattr(source, type)(**dataset_args) def __call__(self): return self.dataset @register class TrainDataset(CommonDataset): pass @register class EvalMOTDataset(CommonDataset): pass @register class TestMOTDataset(CommonDataset): pass @register class EvalDataset(CommonDataset): pass @register class TestDataset(CommonDataset): pass ================================================ FILE: ppdet/data/source/keypoint_coco.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ this code is base on https://github.com/open-mmlab/mmpose """ import os import cv2 import numpy as np import json import copy import pycocotools from pycocotools.coco import COCO from .dataset import DetDataset from ppdet.core.workspace import register, serializable @serializable class KeypointBottomUpBaseDataset(DetDataset): """Base class for bottom-up datasets. All datasets should subclass it. All subclasses should overwrite: Methods:`_get_imganno` Args: dataset_dir (str): Root path to the dataset. anno_path (str): Relative path to the annotation file. image_dir (str): Path to a directory where images are held. Default: None. num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. shard (list): [rank, worldsize], the distributed env params test_mode (bool): Store True when building test or validation dataset. Default: False. 
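    Example (a minimal sketch of the subclassing contract;
    ``ToyBottomUpDataset`` is hypothetical)::

        class ToyBottomUpDataset(KeypointBottomUpBaseDataset):
            def _get_imganno(self, idx):
                # ``__getitem__`` reads 'image_file', converts BGR to RGB,
                # then applies ``self.transform`` to the record.
                return {'image_file': '/path/to/image.jpg', 'im_id': idx}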
""" def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[], shard=[0, 1], test_mode=False): super().__init__(dataset_dir, image_dir, anno_path) self.image_info = {} self.ann_info = {} self.img_prefix = os.path.join(dataset_dir, image_dir) self.transform = transform self.test_mode = test_mode self.ann_info['num_joints'] = num_joints self.img_ids = [] def parse_dataset(self): pass def __len__(self): """Get dataset length.""" return len(self.img_ids) def _get_imganno(self, idx): """Get anno for a single image.""" raise NotImplementedError def __getitem__(self, idx): """Prepare image for training given the index.""" records = copy.deepcopy(self._get_imganno(idx)) records['image'] = cv2.imread(records['image_file']) records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) if 'mask' in records: records['mask'] = (records['mask'] + 0).astype('uint8') records = self.transform(records) return records def parse_dataset(self): return @register @serializable class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset): """COCO dataset for bottom-up pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. COCO keypoint indexes:: 0: 'nose', 1: 'left_eye', 2: 'right_eye', 3: 'left_ear', 4: 'right_ear', 5: 'left_shoulder', 6: 'right_shoulder', 7: 'left_elbow', 8: 'right_elbow', 9: 'left_wrist', 10: 'right_wrist', 11: 'left_hip', 12: 'right_hip', 13: 'left_knee', 14: 'right_knee', 15: 'left_ankle', 16: 'right_ankle' Args: dataset_dir (str): Root path to the dataset. anno_path (str): Relative path to the annotation file. image_dir (str): Path to a directory where images are held. Default: None. num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. shard (list): [rank, worldsize], the distributed env params test_mode (bool): Store True when building test or validation dataset. Default: False. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[], shard=[0, 1], test_mode=False, return_mask=True, return_bbox=True, return_area=True, return_class=True): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform, shard, test_mode) self.ann_file = os.path.join(dataset_dir, anno_path) self.shard = shard self.test_mode = test_mode self.return_mask = return_mask self.return_bbox = return_bbox self.return_area = return_area self.return_class = return_class def parse_dataset(self): self.coco = COCO(self.ann_file) self.img_ids = self.coco.getImgIds() if not self.test_mode: self.img_ids_tmp = [] for img_id in self.img_ids: ann_ids = self.coco.getAnnIds(imgIds=img_id) anno = self.coco.loadAnns(ann_ids) anno = [obj for obj in anno if obj['iscrowd'] == 0] if len(anno) == 0: continue self.img_ids_tmp.append(img_id) self.img_ids = self.img_ids_tmp blocknum = int(len(self.img_ids) / self.shard[1]) self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( self.shard[0] + 1))] self.num_images = len(self.img_ids) self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) self.dataset_name = 'coco' cat_ids = self.coco.getCatIds() self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) print('=> num_images: {}'.format(self.num_images)) @staticmethod def _get_mapping_id_name(imgs): """ Args: imgs (dict): dict of image info. Returns: tuple: Image name & id mapping dicts. - id2name (dict): Mapping image id to name. - name2id (dict): Mapping image name to id. 
""" id2name = {} name2id = {} for image_id, image in imgs.items(): file_name = image['file_name'] id2name[image_id] = file_name name2id[file_name] = image_id return id2name, name2id def _get_imganno(self, idx): """Get anno for a single image. Args: idx (int): image idx Returns: dict: info for model training """ coco = self.coco img_id = self.img_ids[idx] ann_ids = coco.getAnnIds(imgIds=img_id) anno = coco.loadAnns(ann_ids) anno = [ obj for obj in anno if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0 ] db_rec = {} joints, orgsize = self._get_joints(anno, idx) db_rec['gt_joints'] = joints db_rec['im_shape'] = orgsize if self.return_bbox: db_rec['gt_bbox'] = self._get_bboxs(anno, idx) if self.return_class: db_rec['gt_class'] = self._get_labels(anno, idx) if self.return_area: db_rec['gt_areas'] = self._get_areas(anno, idx) if self.return_mask: db_rec['mask'] = self._get_mask(anno, idx) db_rec['im_id'] = img_id db_rec['image_file'] = os.path.join(self.img_prefix, self.id2name[img_id]) return db_rec def _get_joints(self, anno, idx): """Get joints for all people in an image.""" num_people = len(anno) joints = np.zeros( (num_people, self.ann_info['num_joints'], 3), dtype=np.float32) for i, obj in enumerate(anno): joints[i, :self.ann_info['num_joints'], :3] = \ np.array(obj['keypoints']).reshape([-1, 3]) img_info = self.coco.loadImgs(self.img_ids[idx])[0] orgsize = np.array([img_info['height'], img_info['width'], 1]) return joints, orgsize def _get_bboxs(self, anno, idx): num_people = len(anno) gt_bboxes = np.zeros((num_people, 4), dtype=np.float32) for idx, obj in enumerate(anno): if 'bbox' in obj: gt_bboxes[idx, :] = obj['bbox'] gt_bboxes[:, 2] += gt_bboxes[:, 0] gt_bboxes[:, 3] += gt_bboxes[:, 1] return gt_bboxes def _get_labels(self, anno, idx): num_people = len(anno) gt_labels = np.zeros((num_people, 1), dtype=np.float32) for idx, obj in enumerate(anno): if 'category_id' in obj: catid = obj['category_id'] gt_labels[idx, 0] = self.catid2clsid[catid] return gt_labels def _get_areas(self, anno, idx): num_people = len(anno) gt_areas = np.zeros((num_people, ), dtype=np.float32) for idx, obj in enumerate(anno): if 'area' in obj: gt_areas[idx, ] = obj['area'] return gt_areas def _get_mask(self, anno, idx): """Get ignore masks to mask out losses.""" coco = self.coco img_info = coco.loadImgs(self.img_ids[idx])[0] m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32) for obj in anno: if 'segmentation' in obj: if obj['iscrowd']: rle = pycocotools.mask.frPyObjects(obj['segmentation'], img_info['height'], img_info['width']) m += pycocotools.mask.decode(rle) elif obj['num_keypoints'] == 0: rles = pycocotools.mask.frPyObjects(obj['segmentation'], img_info['height'], img_info['width']) for rle in rles: m += pycocotools.mask.decode(rle) return m < 0.5 @register @serializable class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset): """CrowdPose dataset for bottom-up pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. CrowdPose keypoint indexes:: 0: 'left_shoulder', 1: 'right_shoulder', 2: 'left_elbow', 3: 'right_elbow', 4: 'left_wrist', 5: 'right_wrist', 6: 'left_hip', 7: 'right_hip', 8: 'left_knee', 9: 'right_knee', 10: 'left_ankle', 11: 'right_ankle', 12: 'top_head', 13: 'neck' Args: dataset_dir (str): Root path to the dataset. anno_path (str): Relative path to the annotation file. image_dir (str): Path to a directory where images are held. Default: None. 
num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. shard (list): [rank, worldsize], the distributed env params test_mode (bool): Store True when building test or validation dataset. Default: False. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[], shard=[0, 1], test_mode=False): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform, shard, test_mode) self.ann_file = os.path.join(dataset_dir, anno_path) self.shard = shard self.test_mode = test_mode def parse_dataset(self): self.coco = COCO(self.ann_file) self.img_ids = self.coco.getImgIds() if not self.test_mode: self.img_ids = [ img_id for img_id in self.img_ids if len(self.coco.getAnnIds( imgIds=img_id, iscrowd=None)) > 0 ] blocknum = int(len(self.img_ids) / self.shard[1]) self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( self.shard[0] + 1))] self.num_images = len(self.img_ids) self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) self.dataset_name = 'crowdpose' print('=> num_images: {}'.format(self.num_images)) @serializable class KeypointTopDownBaseDataset(DetDataset): """Base class for top_down datasets. All datasets should subclass it. All subclasses should overwrite: Methods:`_get_db` Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[]): super().__init__(dataset_dir, image_dir, anno_path) self.image_info = {} self.ann_info = {} self.img_prefix = os.path.join(dataset_dir, image_dir) self.transform = transform self.ann_info['num_joints'] = num_joints self.db = [] def __len__(self): """Get dataset length.""" return len(self.db) def _get_db(self): """Get a sample""" raise NotImplementedError def __getitem__(self, idx): """Prepare sample for training given the index.""" records = copy.deepcopy(self.db[idx]) records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) records['score'] = records['score'] if 'score' in records else 1 records = self.transform(records) # print('records', records) return records @register @serializable class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): """COCO dataset for top-down pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. COCO keypoint indexes: 0: 'nose', 1: 'left_eye', 2: 'right_eye', 3: 'left_ear', 4: 'right_ear', 5: 'left_shoulder', 6: 'right_shoulder', 7: 'left_elbow', 8: 'right_elbow', 9: 'left_wrist', 10: 'right_wrist', 11: 'left_hip', 12: 'right_hip', 13: 'left_knee', 14: 'right_knee', 15: 'left_ankle', 16: 'right_ankle' Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): Keypoint numbers trainsize (list):[w, h] Image target size transform (composed(operators)): A sequence of data transforms. bbox_file (str): Path to a detection bbox file Default: None. use_gt_bbox (bool): Whether to use ground truth bbox Default: True. pixel_std (int): The pixel std of the scale Default: 200. 
image_thre (float): The threshold to filter the detection box Default: 0.0. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, trainsize, transform=[], bbox_file=None, use_gt_bbox=True, pixel_std=200, image_thre=0.0, center_scale=None): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) self.bbox_file = bbox_file self.use_gt_bbox = use_gt_bbox self.trainsize = trainsize self.pixel_std = pixel_std self.image_thre = image_thre self.center_scale = center_scale self.dataset_name = 'coco' def parse_dataset(self): if self.use_gt_bbox: self.db = self._load_coco_keypoint_annotations() else: self.db = self._load_coco_person_detection_results() def _load_coco_keypoint_annotations(self): coco = COCO(self.get_anno()) img_ids = coco.getImgIds() gt_db = [] for index in img_ids: im_ann = coco.loadImgs(index)[0] width = im_ann['width'] height = im_ann['height'] file_name = im_ann['file_name'] im_id = int(im_ann["id"]) annIds = coco.getAnnIds(imgIds=index, iscrowd=False) objs = coco.loadAnns(annIds) valid_objs = [] for obj in objs: x, y, w, h = obj['bbox'] x1 = np.max((0, x)) y1 = np.max((0, y)) x2 = np.min((width - 1, x1 + np.max((0, w - 1)))) y2 = np.min((height - 1, y1 + np.max((0, h - 1)))) if obj['area'] > 0 and x2 >= x1 and y2 >= y1: obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] valid_objs.append(obj) objs = valid_objs rec = [] for obj in objs: if max(obj['keypoints']) == 0: continue joints = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) joints_vis = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) for ipt in range(self.ann_info['num_joints']): joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0] joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1] joints[ipt, 2] = 0 t_vis = obj['keypoints'][ipt * 3 + 2] if t_vis > 1: t_vis = 1 joints_vis[ipt, 0] = t_vis joints_vis[ipt, 1] = t_vis joints_vis[ipt, 2] = 0 center, scale = self._box2cs(obj['clean_bbox'][:4]) rec.append({ 'image_file': os.path.join(self.img_prefix, file_name), 'center': center, 'scale': scale, 'gt_joints': joints, 'joints_vis': joints_vis, 'im_id': im_id, }) gt_db.extend(rec) return gt_db def _box2cs(self, box): x, y, w, h = box[:4] center = np.zeros((2), dtype=np.float32) center[0] = x + w * 0.5 center[1] = y + h * 0.5 aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] if self.center_scale is not None and np.random.rand() < 0.3: center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h] if w > aspect_ratio * h: h = w * 1.0 / aspect_ratio elif w < aspect_ratio * h: w = h * aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) if center[0] != -1: scale = scale * 1.25 return center, scale def _load_coco_person_detection_results(self): all_boxes = None bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file) with open(bbox_file_path, 'r') as f: all_boxes = json.load(f) if not all_boxes: print('=> Load %s fail!' 
% bbox_file_path) return None kpt_db = [] for n_img in range(0, len(all_boxes)): det_res = all_boxes[n_img] if det_res['category_id'] != 1: continue file_name = det_res[ 'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[ 'image_id'] img_name = os.path.join(self.img_prefix, file_name) box = det_res['bbox'] score = det_res['score'] im_id = int(det_res['image_id']) if score < self.image_thre: continue center, scale = self._box2cs(box) joints = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) joints_vis = np.ones( (self.ann_info['num_joints'], 3), dtype=np.float32) kpt_db.append({ 'image_file': img_name, 'im_id': im_id, 'center': center, 'scale': scale, 'score': score, 'gt_joints': joints, 'joints_vis': joints_vis, }) return kpt_db @register @serializable class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset): """CocoWholeBody dataset for top-down hand pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. COCO-WholeBody Hand keypoint indexes: 0: 'wrist', 1: 'thumb1', 2: 'thumb2', 3: 'thumb3', 4: 'thumb4', 5: 'forefinger1', 6: 'forefinger2', 7: 'forefinger3', 8: 'forefinger4', 9: 'middle_finger1', 10: 'middle_finger2', 11: 'middle_finger3', 12: 'middle_finger4', 13: 'ring_finger1', 14: 'ring_finger2', 15: 'ring_finger3', 16: 'ring_finger4', 17: 'pinky_finger1', 18: 'pinky_finger2', 19: 'pinky_finger3', 20: 'pinky_finger4' Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): Keypoint numbers trainsize (list):[w, h] Image target size transform (composed(operators)): A sequence of data transforms. pixel_std (int): The pixel std of the scale Default: 200. 
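    Example (a sketch with placeholder paths; ``num_joints=21`` matches the
    COCO-WholeBody hand keypoints listed above)::

        dataset = KeypointTopDownCocoWholeBodyHandDataset(
            dataset_dir='dataset/coco_wholebody',
            image_dir='train2017',
            anno_path='annotations/coco_wholebody_train.json',
            num_joints=21,
            trainsize=[256, 256])
        dataset.parse_dataset()   # fills ``dataset.db``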
""" def __init__(self, dataset_dir, image_dir, anno_path, num_joints, trainsize, transform=[], pixel_std=200): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) self.trainsize = trainsize self.pixel_std = pixel_std self.dataset_name = 'coco_wholebady_hand' def _box2cs(self, box): x, y, w, h = box[:4] center = np.zeros((2), dtype=np.float32) center[0] = x + w * 0.5 center[1] = y + h * 0.5 aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] if w > aspect_ratio * h: h = w * 1.0 / aspect_ratio elif w < aspect_ratio * h: w = h * aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) if center[0] != -1: scale = scale * 1.25 return center, scale def parse_dataset(self): gt_db = [] num_joints = self.ann_info['num_joints'] coco = COCO(self.get_anno()) img_ids = list(coco.imgs.keys()) for img_id in img_ids: im_ann = coco.loadImgs(img_id)[0] image_file = os.path.join(self.img_prefix, im_ann['file_name']) im_id = int(im_ann["id"]) ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) objs = coco.loadAnns(ann_ids) for obj in objs: for type in ['left', 'right']: if (obj[f'{type}hand_valid'] and max(obj[f'{type}hand_kpts']) > 0): joints = np.zeros((num_joints, 3), dtype=np.float32) joints_vis = np.zeros((num_joints, 3), dtype=np.float32) keypoints = np.array(obj[f'{type}hand_kpts']) keypoints = keypoints.reshape(-1, 3) joints[:, :2] = keypoints[:, :2] joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) center, scale = self._box2cs(obj[f'{type}hand_box'][:4]) gt_db.append({ 'image_file': image_file, 'center': center, 'scale': scale, 'gt_joints': joints, 'joints_vis': joints_vis, 'im_id': im_id, }) self.db = gt_db @register @serializable class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset): """MPII dataset for topdown pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. MPII keypoint indexes:: 0: 'right_ankle', 1: 'right_knee', 2: 'right_hip', 3: 'left_hip', 4: 'left_knee', 5: 'left_ankle', 6: 'pelvis', 7: 'thorax', 8: 'upper_neck', 9: 'head_top', 10: 'right_wrist', 11: 'right_elbow', 12: 'right_shoulder', 13: 'left_shoulder', 14: 'left_elbow', 15: 'left_wrist', Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): Keypoint numbers trainsize (list):[w, h] Image target size transform (composed(operators)): A sequence of data transforms. 
""" def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[]): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) self.dataset_name = 'mpii' def parse_dataset(self): with open(self.get_anno()) as anno_file: anno = json.load(anno_file) gt_db = [] for a in anno: image_name = a['image'] im_id = a['image_id'] if 'image_id' in a else int( os.path.splitext(image_name)[0]) c = np.array(a['center'], dtype=np.float32) s = np.array([a['scale'], a['scale']], dtype=np.float32) # Adjust center/scale slightly to avoid cropping limbs if c[0] != -1: c[1] = c[1] + 15 * s[1] s = s * 1.25 c = c - 1 joints = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) joints_vis = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) if 'gt_joints' in a: joints_ = np.array(a['gt_joints']) joints_[:, 0:2] = joints_[:, 0:2] - 1 joints_vis_ = np.array(a['joints_vis']) assert len(joints_) == self.ann_info[ 'num_joints'], 'joint num diff: {} vs {}'.format( len(joints_), self.ann_info['num_joints']) joints[:, 0:2] = joints_[:, 0:2] joints_vis[:, 0] = joints_vis_[:] joints_vis[:, 1] = joints_vis_[:] gt_db.append({ 'image_file': os.path.join(self.img_prefix, image_name), 'im_id': im_id, 'center': c, 'scale': s, 'gt_joints': joints, 'joints_vis': joints_vis }) print("number length: {}".format(len(gt_db))) self.db = gt_db ================================================ FILE: ppdet/data/source/lvis.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import copy try: from collections.abc import Sequence except Exception: from collections import Sequence import numpy as np from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'LVISDataSet', ] @register @serializable class LVISDataSet(DetDataset): """ Load dataset with LVISDataSet format. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): coco annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. load_crowd (bool): whether to load crowded ground-truth. False as default allow_empty (bool): whether to load empty entry. False as default empty_ratio (float): the ratio of empty record number to total record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default repeat (int): repeat times for dataset, use in benchmark. 
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1): super(LVISDataSet, self).__init__( dataset_dir, image_dir, anno_path, data_fields, sample_num, repeat=repeat) self.load_image_only = False self.load_semantic = False self.load_crowd = load_crowd self.allow_empty = allow_empty self.empty_ratio = empty_ratio def _sample_empty(self, records, num): # if empty_ratio is out of [0. ,1.), do not sample the records if self.empty_ratio < 0. or self.empty_ratio >= 1.: return records import random sample_num = min( int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) records = random.sample(records, sample_num) return records def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from lvis import LVIS lvis_ = LVIS(anno_path) img_ids = lvis_.get_img_ids() img_ids.sort() cat_ids = lvis_.get_cat_ids() records = [] empty_records = [] ct = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ lvis_.load_cats([catid])[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in lvis_.dataset: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) for img_id in img_ids: img_anno = lvis_.load_imgs([img_id])[0] im_fname = img_anno['coco_url'].replace('http://images.cocodataset.org/', '') im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue coco_rec = { 'im_file': im_path, 'im_id': np.array([img_id]), 'h': im_h, 'w': im_w, } if 'image' in self.data_fields else {} if not self.load_image_only: ins_anno_ids = lvis_.get_ann_ids(img_ids=[img_id]) instances = lvis_.load_anns(ins_anno_ids) bboxes = [] is_rbox_anno = False for inst in instances: # check gt bbox if inst.get('ignore', False): continue if 'bbox' not in inst.keys(): continue else: if not any(np.array(inst['bbox'])): continue x1, y1, box_w, box_h = inst['bbox'] x2 = x1 + box_w y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] bboxes.append(inst) else: logger.warning( 'Found an invalid bbox in annotations: im_id: {}, ' 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( img_id, float(inst['area']), x1, y1, x2, y2)) num_bbox = len(bboxes) if num_bbox <= 0 and not self.allow_empty: continue elif num_bbox <= 0: is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) has_segmentation = False has_track_id = False for i, box in enumerate(bboxes): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] # is_crowd[i][0] = box['iscrowd'] # check RLE format # if 'segmentation' in box 
and box['iscrowd'] == 1: # gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] # elif 'segmentation' in box and box['segmentation']: # if not np.array( # box['segmentation'], # dtype=object).size > 0 and not self.allow_empty: # bboxes.pop(i) # gt_poly.pop(i) # np.delete(is_crowd, i) # np.delete(gt_class, i) # np.delete(gt_bbox, i) # else: # gt_poly[i] = box['segmentation'] # has_segmentation = True if 'track_id' in box: gt_track_id[i][0] = box['track_id'] has_track_id = True if has_segmentation and not any( gt_poly) and not self.allow_empty: continue gt_rec = { 'is_crowd': is_crowd, 'gt_class': gt_class, 'gt_bbox': gt_bbox, 'gt_poly': gt_poly, } if has_track_id: gt_rec.update({'gt_track_id': gt_track_id}) for k, v in gt_rec.items(): if k in self.data_fields: coco_rec[k] = v # TODO: remove load_semantic if self.load_semantic and 'semantic' in self.data_fields: seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', 'train2017', im_fname[:-3] + 'png') coco_rec.update({'semantic': seg_path}) logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( im_path, img_id, im_h, im_w)) if is_empty: empty_records.append(coco_rec) else: records.append(coco_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. format(ct, len(img_ids) - ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records ================================================ FILE: ppdet/data/source/mot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import cv2 import glob import numpy as np from collections import OrderedDict, defaultdict try: from collections.abc import Sequence except Exception: from collections import Sequence from .dataset import DetDataset, _make_dataset, _is_valid_file from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class MOTDataSet(DetDataset): """ Load dataset with MOT format, only support single class MOT. Args: dataset_dir (str): root directory for dataset. image_lists (str|list): mot data image lists, muiti-source mot dataset. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. repeat (int): repeat times for dataset, use in benchmark. Notes: MOT datasets root directory following this: dataset/mot |——————image_lists | |——————caltech.train | |——————caltech.val | |——————mot16.train | |——————mot17.train | ...... |——————Caltech |——————MOT17 |——————...... All the MOT datasets have the following structure: Caltech |——————images | └——————00001.jpg | |—————— ... | └——————0000N.jpg └——————labels_with_ids └——————00001.txt |—————— ... 
└——————0000N.txt or MOT17 |——————images | └——————train | └——————test └——————labels_with_ids └——————train """ def __init__(self, dataset_dir=None, image_lists=[], data_fields=['image'], sample_num=-1, repeat=1): super(MOTDataSet, self).__init__( dataset_dir=dataset_dir, data_fields=data_fields, sample_num=sample_num, repeat=repeat) self.dataset_dir = dataset_dir self.image_lists = image_lists if isinstance(self.image_lists, str): self.image_lists = [self.image_lists] self.roidbs = None self.cname2cid = None def get_anno(self): if self.image_lists == []: return # only used to get categories and metric # only check first data, but the label_list of all data should be same. first_mot_data = self.image_lists[0].split('.')[0] anno_file = os.path.join(self.dataset_dir, first_mot_data, 'label_list.txt') return anno_file def parse_dataset(self): self.img_files = OrderedDict() self.img_start_index = OrderedDict() self.label_files = OrderedDict() self.tid_num = OrderedDict() self.tid_start_index = OrderedDict() img_index = 0 for data_name in self.image_lists: # check every data image list image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') assert os.path.isdir(image_lists_dir), \ "The {} is not a directory.".format(image_lists_dir) list_path = os.path.join(image_lists_dir, data_name) assert os.path.exists(list_path), \ "The list path {} does not exist.".format(list_path) # record img_files, filter out empty ones with open(list_path, 'r') as file: self.img_files[data_name] = file.readlines() self.img_files[data_name] = [ os.path.join(self.dataset_dir, x.strip()) for x in self.img_files[data_name] ] self.img_files[data_name] = list( filter(lambda x: len(x) > 0, self.img_files[data_name])) self.img_start_index[data_name] = img_index img_index += len(self.img_files[data_name]) # record label_files self.label_files[data_name] = [ x.replace('images', 'labels_with_ids').replace( '.png', '.txt').replace('.jpg', '.txt') for x in self.img_files[data_name] ] for data_name, label_paths in self.label_files.items(): max_index = -1 for lp in label_paths: lb = np.loadtxt(lp) if len(lb) < 1: continue if len(lb.shape) < 2: img_max = lb[1] else: img_max = np.max(lb[:, 1]) if img_max > max_index: max_index = img_max self.tid_num[data_name] = int(max_index + 1) last_index = 0 for i, (k, v) in enumerate(self.tid_num.items()): self.tid_start_index[k] = last_index last_index += v self.num_identities_dict = defaultdict(int) self.num_identities_dict[0] = int(last_index + 1) # single class self.num_imgs_each_data = [len(x) for x in self.img_files.values()] self.total_imgs = sum(self.num_imgs_each_data) logger.info('MOT dataset summary: ') logger.info(self.tid_num) logger.info('Total images: {}'.format(self.total_imgs)) logger.info('Image start index: {}'.format(self.img_start_index)) logger.info('Total identities: {}'.format(self.num_identities_dict[0])) logger.info('Identity start index: {}'.format(self.tid_start_index)) records = [] cname2cid = mot_label() for img_index in range(self.total_imgs): for i, (k, v) in enumerate(self.img_start_index.items()): if img_index >= v: data_name = list(self.label_files.keys())[i] start_index = v img_file = self.img_files[data_name][img_index - start_index] lbl_file = self.label_files[data_name][img_index - start_index] if not os.path.exists(img_file): logger.warning('Illegal image file: {}, and it will be ignored'. format(img_file)) continue if not os.path.isfile(lbl_file): logger.warning('Illegal label file: {}, and it will be ignored'. 
format(lbl_file)) continue labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] cx, cy = labels[:, 2], labels[:, 3] w, h = labels[:, 4], labels[:, 5] gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') gt_class = labels[:, 0:1].astype('int32') gt_score = np.ones((len(labels), 1)).astype('float32') gt_ide = labels[:, 1:2].astype('int32') for i, _ in enumerate(gt_ide): if gt_ide[i] > -1: gt_ide[i] += self.tid_start_index[data_name] mot_rec = { 'im_file': img_file, 'im_id': img_index, } if 'image' in self.data_fields else {} gt_rec = { 'gt_class': gt_class, 'gt_score': gt_score, 'gt_bbox': gt_bbox, 'gt_ide': gt_ide, } for k, v in gt_rec.items(): if k in self.data_fields: mot_rec[k] = v records.append(mot_rec) if self.sample_num > 0 and img_index >= self.sample_num: break assert len(records) > 0, 'not found any mot record in %s' % ( self.image_lists) self.roidbs, self.cname2cid = records, cname2cid @register @serializable class MCMOTDataSet(DetDataset): """ Load dataset with MOT format, support multi-class MOT. Args: dataset_dir (str): root directory for dataset. image_lists (list(str)): mcmot data image lists, muiti-source mcmot dataset. data_fields (list): key name of data dictionary, at least have 'image'. label_list (str): if use_default_label is False, will load mapping between category and class index. sample_num (int): number of samples to load, -1 means all. Notes: MCMOT datasets root directory following this: dataset/mot |——————image_lists | |——————visdrone_mcmot.train | |——————visdrone_mcmot.val visdrone_mcmot |——————images | └——————train | └——————val └——————labels_with_ids └——————train """ def __init__(self, dataset_dir=None, image_lists=[], data_fields=['image'], label_list=None, sample_num=-1): super(MCMOTDataSet, self).__init__( dataset_dir=dataset_dir, data_fields=data_fields, sample_num=sample_num) self.dataset_dir = dataset_dir self.image_lists = image_lists if isinstance(self.image_lists, str): self.image_lists = [self.image_lists] self.label_list = label_list self.roidbs = None self.cname2cid = None def get_anno(self): if self.image_lists == []: return # only used to get categories and metric # only check first data, but the label_list of all data should be same. 
first_mot_data = self.image_lists[0].split('.')[0] anno_file = os.path.join(self.dataset_dir, first_mot_data, 'label_list.txt') return anno_file def parse_dataset(self): self.img_files = OrderedDict() self.img_start_index = OrderedDict() self.label_files = OrderedDict() self.tid_num = OrderedDict() self.tid_start_idx_of_cls_ids = defaultdict(dict) # for MCMOT img_index = 0 for data_name in self.image_lists: # check every data image list image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') assert os.path.isdir(image_lists_dir), \ "The {} is not a directory.".format(image_lists_dir) list_path = os.path.join(image_lists_dir, data_name) assert os.path.exists(list_path), \ "The list path {} does not exist.".format(list_path) # record img_files, filter out empty ones with open(list_path, 'r') as file: self.img_files[data_name] = file.readlines() self.img_files[data_name] = [ os.path.join(self.dataset_dir, x.strip()) for x in self.img_files[data_name] ] self.img_files[data_name] = list( filter(lambda x: len(x) > 0, self.img_files[data_name])) self.img_start_index[data_name] = img_index img_index += len(self.img_files[data_name]) # record label_files self.label_files[data_name] = [ x.replace('images', 'labels_with_ids').replace( '.png', '.txt').replace('.jpg', '.txt') for x in self.img_files[data_name] ] for data_name, label_paths in self.label_files.items(): # using max_ids_dict rather than max_index max_ids_dict = defaultdict(int) for lp in label_paths: lb = np.loadtxt(lp) if len(lb) < 1: continue lb = lb.reshape(-1, 6) for item in lb: if item[1] > max_ids_dict[int(item[0])]: # item[0]: cls_id # item[1]: track id max_ids_dict[int(item[0])] = int(item[1]) # track id number self.tid_num[data_name] = max_ids_dict last_idx_dict = defaultdict(int) for i, (k, v) in enumerate(self.tid_num.items()): # each sub dataset for cls_id, id_num in v.items(): # v is a max_ids_dict self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id] last_idx_dict[cls_id] += id_num self.num_identities_dict = defaultdict(int) for k, v in last_idx_dict.items(): self.num_identities_dict[k] = int(v) # total ids of each category self.num_imgs_each_data = [len(x) for x in self.img_files.values()] self.total_imgs = sum(self.num_imgs_each_data) # cname2cid and cid2cname cname2cid = {} if self.label_list is not None: # if use label_list for multi source mix dataset, # please make sure label_list in the first sub_dataset at least. sub_dataset = self.image_lists[0].split('.')[0] label_path = os.path.join(self.dataset_dir, sub_dataset, self.label_list) if not os.path.exists(label_path): logger.info( "Note: label_list {} does not exists, use VisDrone 10 classes labels as default.". 
format(label_path)) cname2cid = visdrone_mcmot_label() else: with open(label_path, 'r') as fr: label_id = 0 for line in fr.readlines(): cname2cid[line.strip()] = label_id label_id += 1 else: cname2cid = visdrone_mcmot_label() cid2cname = dict([(v, k) for (k, v) in cname2cid.items()]) logger.info('MCMOT dataset summary: ') logger.info(self.tid_num) logger.info('Total images: {}'.format(self.total_imgs)) logger.info('Image start index: {}'.format(self.img_start_index)) logger.info('Total identities of each category: ') num_identities_dict = sorted( self.num_identities_dict.items(), key=lambda x: x[0]) total_IDs_all_cats = 0 for (k, v) in num_identities_dict: logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k], v)) total_IDs_all_cats += v logger.info('Total identities of all categories: {}'.format( total_IDs_all_cats)) logger.info('Identity start index of each category: ') for k, v in self.tid_start_idx_of_cls_ids.items(): sorted_v = sorted(v.items(), key=lambda x: x[0]) for (cls_id, start_idx) in sorted_v: logger.info('Start index of dataset {} category {:d} is {:d}' .format(k, cls_id, start_idx)) records = [] for img_index in range(self.total_imgs): for i, (k, v) in enumerate(self.img_start_index.items()): if img_index >= v: data_name = list(self.label_files.keys())[i] start_index = v img_file = self.img_files[data_name][img_index - start_index] lbl_file = self.label_files[data_name][img_index - start_index] if not os.path.exists(img_file): logger.warning('Illegal image file: {}, and it will be ignored'. format(img_file)) continue if not os.path.isfile(lbl_file): logger.warning('Illegal label file: {}, and it will be ignored'. format(lbl_file)) continue labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] cx, cy = labels[:, 2], labels[:, 3] w, h = labels[:, 4], labels[:, 5] gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') gt_class = labels[:, 0:1].astype('int32') gt_score = np.ones((len(labels), 1)).astype('float32') gt_ide = labels[:, 1:2].astype('int32') for i, _ in enumerate(gt_ide): if gt_ide[i] > -1: cls_id = int(gt_class[i]) start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id] gt_ide[i] += start_idx mot_rec = { 'im_file': img_file, 'im_id': img_index, } if 'image' in self.data_fields else {} gt_rec = { 'gt_class': gt_class, 'gt_score': gt_score, 'gt_bbox': gt_bbox, 'gt_ide': gt_ide, } for k, v in gt_rec.items(): if k in self.data_fields: mot_rec[k] = v records.append(mot_rec) if self.sample_num > 0 and img_index >= self.sample_num: break assert len(records) > 0, 'not found any mot record in %s' % ( self.image_lists) self.roidbs, self.cname2cid = records, cname2cid @register @serializable class MOTImageFolder(DetDataset): """ Load MOT dataset with MOT format from image folder or video . Args: video_file (str): path of the video file, default ''. frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set. dataset_dir (str): root directory for dataset. keep_ori_im (bool): whether to keep original image, default False. Set True when used during MOT model inference while saving images or video, or used in DeepSORT. 
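    Example (a minimal sketch; the video path is a placeholder and ffmpeg
    must be installed, since frames are extracted through ``video2frames``)::

        dataset = MOTImageFolder(video_file='demo.mp4', keep_ori_im=True)
        dataset.parse_dataset()    # extracts frames, fills ``dataset.roidbs``
        print(dataset.get_imid2path())   # frame id -> extracted frame path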
""" def __init__(self, video_file=None, frame_rate=-1, dataset_dir=None, data_root=None, image_dir=None, sample_num=-1, keep_ori_im=False, anno_path=None, **kwargs): super(MOTImageFolder, self).__init__( dataset_dir, image_dir, sample_num=sample_num) self.video_file = video_file self.data_root = data_root self.keep_ori_im = keep_ori_im self._imid2path = {} self.roidbs = None self.frame_rate = frame_rate self.anno_path = anno_path def check_or_download_dataset(self): return def parse_dataset(self, ): if not self.roidbs: if self.video_file is None: self.frame_rate = 30 # set as default if infer image folder self.roidbs = self._load_images() else: self.roidbs = self._load_video_images() def _load_video_images(self): if self.frame_rate == -1: # if frame_rate is not set for video, use cv2.VideoCapture cap = cv2.VideoCapture(self.video_file) self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) extension = self.video_file.split('.')[-1] output_path = self.video_file.replace('.{}'.format(extension), '') frames_path = video2frames(self.video_file, output_path, self.frame_rate) self.video_frames = sorted( glob.glob(os.path.join(frames_path, '*.png'))) self.video_length = len(self.video_frames) logger.info('Length of the video: {:d} frames.'.format( self.video_length)) ct = 0 records = [] for image in self.video_frames: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break rec = {'im_id': np.array([ct]), 'im_file': image} if self.keep_ori_im: rec.update({'keep_ori_im': 1}) self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def _find_images(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] images = [] for im_dir in image_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.dataset_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def _load_images(self): images = self._find_images() ct = 0 records = [] for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break rec = {'im_id': np.array([ct]), 'im_file': image} if self.keep_ori_im: rec.update({'keep_ori_im': 1}) self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_imid2path(self): return self._imid2path def set_images(self, images): self.image_dir = images self.roidbs = self._load_images() def set_video(self, video_file, frame_rate): # update video_file and frame_rate by command line of tools/infer_mot.py self.video_file = video_file self.frame_rate = frame_rate assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \ "wrong or unsupported file format: {}".format(self.video_file) self.roidbs = self._load_video_images() def get_anno(self): return self.anno_path def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')): return f.lower().endswith(extensions) def video2frames(video_path, outpath, frame_rate, **kargs): def _dict2str(kargs): cmd_str = '' for k, v in kargs.items(): cmd_str += (' ' + str(k) + ' ' + str(v)) return cmd_str ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error '] vid_name = os.path.basename(video_path).split('.')[0] out_full_path = os.path.join(outpath, vid_name) if not os.path.exists(out_full_path): os.makedirs(out_full_path) # video file 
name outformat = os.path.join(out_full_path, '%08d.png') cmd = ffmpeg cmd = ffmpeg + [ ' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat ] cmd = ''.join(cmd) + _dict2str(kargs) if os.system(cmd) != 0: raise RuntimeError('ffmpeg process video: {} error'.format(video_path)) sys.exit(-1) sys.stdout.flush() return out_full_path def mot_label(): labels_map = {'person': 0} return labels_map def visdrone_mcmot_label(): labels_map = { 'pedestrian': 0, 'people': 1, 'bicycle': 2, 'car': 3, 'van': 4, 'truck': 5, 'tricycle': 6, 'awning-tricycle': 7, 'bus': 8, 'motor': 9, } return labels_map ================================================ FILE: ppdet/data/source/pose3d_cmb.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import numpy as np import json import copy import pycocotools from pycocotools.coco import COCO from .dataset import DetDataset from ppdet.core.workspace import register, serializable from paddle.io import Dataset @serializable class Pose3DDataset(DetDataset): """Pose3D Dataset class. Args: dataset_dir (str): Root path to the dataset. anno_list (list of str): each of the element is a relative path to the annotation file. image_dirs (list of str): each of path is a relative path where images are held. transform (composed(operators)): A sequence of data transforms. test_mode (bool): Store True when building test or validation dataset. Default: False. 
    24 joints order:
        0-2: 'R_Ankle', 'R_Knee', 'R_Hip',
        3-5: 'L_Hip', 'L_Knee', 'L_Ankle',
        6-8: 'R_Wrist', 'R_Elbow', 'R_Shoulder',
        9-11: 'L_Shoulder', 'L_Elbow', 'L_Wrist',
        12-14: 'Neck', 'Top_of_Head', 'Pelvis',
        15-18: 'Thorax', 'Spine', 'Jaw', 'Head',
        19-23: 'Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear'
    """

    def __init__(self,
                 dataset_dir,
                 image_dirs,
                 anno_list,
                 transform=[],
                 num_joints=24,
                 test_mode=False):
        super().__init__(dataset_dir, image_dirs, anno_list)
        self.image_info = {}
        self.ann_info = {}
        self.num_joints = num_joints
        self.transform = transform
        self.test_mode = test_mode
        self.img_ids = []
        self.dataset_dir = dataset_dir
        self.image_dirs = image_dirs
        self.anno_list = anno_list

    def get_mask(self, mvm_percent=0.3):
        num_joints = self.num_joints
        mjm_mask = np.ones((num_joints, 1)).astype(np.float32)
        if not self.test_mode:
            pb = np.random.random_sample()
            masked_num = int(
                pb * mvm_percent *
                num_joints)  # at most x% of the joints could be masked
            indices = np.random.choice(
                np.arange(num_joints), replace=False, size=masked_num)
            mjm_mask[indices, :] = 0.0

        num_joints = 10
        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
        if not self.test_mode:
            num_vertices = num_joints
            pb = np.random.random_sample()
            masked_num = int(
                pb * mvm_percent *
                num_vertices)  # at most x% of the vertices could be masked
            indices = np.random.choice(
                np.arange(num_vertices), replace=False, size=masked_num)
            mvm_mask[indices, :] = 0.0

        mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)
        return mjm_mask

    def filterjoints(self, x):
        if self.num_joints == 24:
            return x
        elif self.num_joints == 14:
            return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]
        elif self.num_joints == 17:
            return x[
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]
        else:
            raise ValueError(
                "unsupported joint numbers, only [24 or 17 or 14] is supported!"
            )

    def parse_dataset(self):
        print("Loading annotations..., please wait")
        self.annos = []
        im_id = 0
        self.human36m_num = 0
        for idx, annof in enumerate(self.anno_list):
            img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
            dataf = os.path.join(self.dataset_dir, annof)
            with open(dataf, 'r') as rf:
                anno_data = json.load(rf)
            annos = anno_data['data']
            print("{} has {} annotations".format(dataf, len(annos)))
            for anno in annos:
                new_anno = {}
                new_anno['im_id'] = im_id
                im_id += 1
                imagename = anno['imageName']
                if imagename.startswith("COCO_train2014_"):
                    imagename = imagename[len("COCO_train2014_"):]
                elif imagename.startswith("COCO_val2014_"):
                    imagename = imagename[len("COCO_val2014_"):]
                imagename = os.path.join(img_prefix, imagename)
                if not os.path.exists(imagename):
                    if "train2017" in imagename:
                        imagename = imagename.replace("train2017", "val2017")
                        if not os.path.exists(imagename):
                            print("cannot find image path: {}".format(
                                imagename))
                            continue
                    else:
                        print("cannot find image path: {}".format(imagename))
                        continue
                new_anno['imageName'] = imagename
                if 'human3.6m' in imagename:
                    self.human36m_num += 1
                new_anno['bbox_center'] = anno['bbox_center']
                new_anno['bbox_scale'] = anno['bbox_scale']
                new_anno['joints_2d'] = np.array(anno[
                    'gt_keypoint_2d']).astype(np.float32)
                if new_anno['joints_2d'].shape[0] == 49:
                    # if joints_2d is in the SPIN format (generated by EFT),
                    # keep the last 24 public joints; for details see
                    # https://github.com/nkolot/SPIN/blob/master/constants.py
                    new_anno['joints_2d'] = new_anno['joints_2d'][25:]
                new_anno['joints_3d'] = np.array(anno[
                    'pose3d'])[:, :3].astype(np.float32)
                new_anno['mjm_mask'] = self.get_mask()
                if 'has_3d_joints' not in anno:
                    new_anno['has_3d_joints'] = int(1)
                    new_anno['has_2d_joints'] = int(1)
                else:
                    new_anno['has_3d_joints'] = int(anno['has_3d_joints'])
                    new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
                new_anno['joints_2d'] = self.filterjoints(new_anno[
                    'joints_2d'])
                self.annos.append(new_anno)
            del annos

    def get_temp_num(self):
        """get temporal data number, like human3.6m"""
        return self.human36m_num

    def __len__(self):
        """Get dataset length."""
        return len(self.annos)

    def _get_imganno(self, idx):
        """Get anno for a single image."""
        return self.annos[idx]

    def __getitem__(self, idx):
        """Prepare image for training given the index."""
        records = copy.deepcopy(self._get_imganno(idx))
        imgpath = records['imageName']
        assert os.path.exists(imgpath), "cannot find image {}".format(imgpath)
        records['image'] = cv2.imread(imgpath)
        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
        records = self.transform(records)
        return records

    def check_or_download_dataset(self):
        alldatafind = True
        for image_dir in self.image_dirs:
            image_dir = os.path.join(self.dataset_dir, image_dir)
            if not os.path.isdir(image_dir):
                print("dataset [{}] is not found".format(image_dir))
                alldatafind = False
        if not alldatafind:
            raise ValueError(
                "Some datasets are not valid and cannot be downloaded "
                "automatically now, please prepare the dataset first")


@register
@serializable
class Keypoint3DMultiFramesDataset(Dataset):
    """24-keypoint 3D dataset for pose estimation; each item is a sequence
    of images. The dataset loads raw features and applies specified
    transforms to return a dict containing the image tensors and other
    information.

    Args:
        dataset_dir (str): Root path to the dataset.
        image_dir (str): Path to a directory where images are held.
    """

    def __init__(
            self,
            dataset_dir,  # root directory of the dataset
            image_dir,  # directory of images
            p3d_dir,  # directory of 3D keypoint files
            json_path,
            img_size,  # target size images are resized to
            num_frames,  # length of each frame sequence
            anno_path=None, ):
        self.dataset_dir = dataset_dir
        self.image_dir = image_dir
        self.p3d_dir = p3d_dir
        self.json_path = json_path
        self.img_size = img_size
        self.num_frames = num_frames
        self.anno_path = anno_path

        self.data_labels, self.mf_inds = self._generate_multi_frames_list()

    def _generate_multi_frames_list(self):
        act_list = os.listdir(self.dataset_dir)  # list of actions
        count = 0
        mf_list = []
        annos_dict = {'images': [], 'annotations': [], 'act_inds': []}
        for act in act_list:  # generate a frame sequence for every action
            if '.' in act:
                continue
            json_path = os.path.join(self.dataset_dir, act, self.json_path)
            with open(json_path, 'r') as j:
                annos = json.load(j)
            length = len(annos['images'])
            for k, v in annos.items():
                if k in annos_dict:
                    annos_dict[k].extend(v)
            annos_dict['act_inds'].extend([act] * length)

            mf = [[i + j + count for j in range(self.num_frames)]
                  for i in range(0, length - self.num_frames + 1)]
            mf_list.extend(mf)
            count += length
        print("total data number:", len(mf_list))
        return annos_dict, mf_list

    def __call__(self, *args, **kwargs):
        return self

    def __getitem__(self, index):  # fetch one consecutive frame sequence
        inds = self.mf_inds[
            index]  # e.g. [568, 569, 570, 571, 572, 573], length num_frames
        images = self.data_labels['images']  # all images
        annots = self.data_labels['annotations']  # all annots
        act = self.data_labels['act_inds'][inds[
            0]]  # action name (folder name)

        kps3d_list = []
        kps3d_vis_list = []
        names = []
        h, w = 0, 0
        for ind in inds:  # one image
            height = float(images[ind]['height'])
            width = float(images[ind]['width'])
            name = images[ind]['file_name']  # image file name, with extension
            kps3d_name = name.split('.')[0] + '.obj'
            kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,
                                      kps3d_name)
            joints, joints_vis = self.kps3d_process(kps3d_path)
            joints_vis = np.array(joints_vis, dtype=np.float32)
            kps3d_list.append(joints)
            kps3d_vis_list.append(joints_vis)
            names.append(name)
        kps3d = np.array(
            kps3d_list)  # (num_frames, joints_num, 3), e.g. (6, 24, 3)
        kps3d_vis = np.array(kps3d_vis_list)

        # read images
        imgs = []
        for name in names:
            img_path = os.path.join(self.dataset_dir, act, self.image_dir,
                                    name)
            image = cv2.imread(img_path, cv2.IMREAD_COLOR |
                               cv2.IMREAD_IGNORE_ORIENTATION)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            imgs.append(np.expand_dims(image, axis=0))
        imgs = np.concatenate(imgs, axis=0)
        imgs = imgs.astype(
            np.float32)  # (num_frames, h, w, c), e.g. (6, 1080, 1920, 3)

        # attention: images and annotations are mirrored at this point
        records = {
            'kps3d': kps3d,
            'kps3d_vis': kps3d_vis,
            "image": imgs,
            'act': act,
            'names': names,
            'im_id': index
        }
        return self.transform(records)

    def kps3d_process(self, kps3d_path):
        count = 0
        kps = []
        kps_vis = []

        with open(kps3d_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if line[0] == 'v':
                    kps.append([])
                    line = line.strip('\n').split(' ')[1:]
                    for kp in line:
                        kps[-1].append(float(kp))
                    count += 1
                    kps_vis.append([1, 1, 1])
        kps = np.array(kps)  # (52, 3)
        kps_vis = np.array(kps_vis)

        kps *= 10  # scale points
        kps -= kps[[0], :]  # set root point to zero
        kps = np.concatenate((kps[0:23], kps[[37]]), axis=0)  # (24, 3)
        kps *= 10
        kps_vis = np.concatenate(
            (kps_vis[0:23], kps_vis[[37]]), axis=0)  # (24, 3)
        return kps, kps_vis

    def __len__(self):
        return len(self.mf_inds)

    def get_anno(self):
        if self.anno_path is None:
            return
        return os.path.join(self.dataset_dir, self.anno_path)

    def check_or_download_dataset(self):
        return

    def parse_dataset(self):
        return

    def set_transform(self, transform):
        self.transform = transform

    def set_epoch(self, epoch_id):
        self._epoch = epoch_id

    def set_kwargs(self, **kwargs):
        self.mixup_epoch = kwargs.get('mixup_epoch', -1)
        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)


================================================
FILE: ppdet/data/source/sniper_coco.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import json import copy import numpy as np try: from collections.abc import Sequence except Exception: from collections import Sequence from ppdet.core.workspace import register, serializable from ppdet.data.crop_utils.annotation_cropper import AnnoCropper from .coco import COCODataSet from .dataset import _make_dataset, _is_valid_file from ppdet.utils.logger import setup_logger logger = setup_logger('sniper_coco_dataset') @register @serializable class SniperCOCODataSet(COCODataSet): """SniperCOCODataSet""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, proposals_file=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=True, empty_ratio=1., is_trainset=True, image_target_sizes=[2000, 1000], valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]], chip_target_size=500, chip_target_stride=200, use_neg_chip=False, max_neg_num_per_im=8, max_per_img=-1, nms_thresh=0.5): super(SniperCOCODataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, load_crowd=load_crowd, allow_empty=allow_empty, empty_ratio=empty_ratio ) self.proposals_file = proposals_file self.proposals = None self.anno_cropper = None self.is_trainset = is_trainset self.image_target_sizes = image_target_sizes self.valid_box_ratio_ranges = valid_box_ratio_ranges self.chip_target_size = chip_target_size self.chip_target_stride = chip_target_stride self.use_neg_chip = use_neg_chip self.max_neg_num_per_im = max_neg_num_per_im self.max_per_img = max_per_img self.nms_thresh = nms_thresh def parse_dataset(self): if not hasattr(self, "roidbs"): super(SniperCOCODataSet, self).parse_dataset() if self.is_trainset: self._parse_proposals() self._merge_anno_proposals() self.ori_roidbs = copy.deepcopy(self.roidbs) self.init_anno_cropper() self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset) def set_proposals_file(self, file_path): self.proposals_file = file_path def init_anno_cropper(self): logger.info("Init AnnoCropper...") self.anno_cropper = AnnoCropper( image_target_sizes=self.image_target_sizes, valid_box_ratio_ranges=self.valid_box_ratio_ranges, chip_target_size=self.chip_target_size, chip_target_stride=self.chip_target_stride, use_neg_chip=self.use_neg_chip, max_neg_num_per_im=self.max_neg_num_per_im, max_per_img=self.max_per_img, nms_thresh=self.nms_thresh ) def generate_chips_roidbs(self, roidbs, is_trainset): if is_trainset: roidbs = self.anno_cropper.crop_anno_records(roidbs) else: roidbs = self.anno_cropper.crop_infer_anno_records(roidbs) return roidbs def _parse_proposals(self): if self.proposals_file: self.proposals = {} logger.info("Parse proposals file:{}".format(self.proposals_file)) with open(self.proposals_file, 'r') as f: proposals = json.load(f) for prop in proposals: image_id = prop["image_id"] if image_id not in self.proposals: self.proposals[image_id] = [] x, y, w, h = prop["bbox"] self.proposals[image_id].append([x, y, x + w, y + h]) def _merge_anno_proposals(self): assert self.roidbs if self.proposals and len(self.proposals.keys()) > 0: logger.info("merge proposals to 
annos") for id, record in enumerate(self.roidbs): image_id = int(record["im_id"]) if image_id not in self.proposals.keys(): logger.info("image id :{} no proposals".format(image_id)) record["proposals"] = np.array(self.proposals.get(image_id, []), dtype=np.float32) self.roidbs[id] = record def get_ori_roidbs(self): if not hasattr(self, "ori_roidbs"): return None return self.ori_roidbs def get_roidbs(self): if not hasattr(self, "roidbs"): self.parse_dataset() return self.roidbs def set_roidbs(self, roidbs): self.roidbs = roidbs def check_or_download_dataset(self): return def _parse(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] images = [] for im_dir in image_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.dataset_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def _load_images(self): images = self._parse() ct = 0 records = [] for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break im = cv2.imread(image) h, w, c = im.shape rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w} self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_imid2path(self): return self._imid2path def set_images(self, images): self._imid2path = {} self.image_dir = images self.roidbs = self._load_images() ================================================ FILE: ppdet/data/source/voc.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import numpy as np import xml.etree.ElementTree as ET from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class VOCDataSet(DetDataset): """ Load dataset with PascalVOC format. Notes: `anno_path` must contains xml file and image file path for annotations. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): voc annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. label_list (str): if use_default_label is False, will load mapping between category and class index. allow_empty (bool): whether to load empty entry. False as default empty_ratio (float): the ratio of empty record number to total record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default repeat (int): repeat times for dataset, use in benchmark. 
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, label_list=None, allow_empty=False, empty_ratio=1., repeat=1): super(VOCDataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, repeat=repeat) self.label_list = label_list self.allow_empty = allow_empty self.empty_ratio = empty_ratio def _sample_empty(self, records, num): # if empty_ratio is out of [0. ,1.), do not sample the records if self.empty_ratio < 0. or self.empty_ratio >= 1.: return records import random sample_num = min( int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) records = random.sample(records, sample_num) return records def parse_dataset(self, ): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) # mapping category name to class id # first_class:0, second_class:1, ... records = [] empty_records = [] ct = 0 cname2cid = {} if self.label_list: label_path = os.path.join(self.dataset_dir, self.label_list) if not os.path.exists(label_path): raise ValueError("label_list {} does not exists".format( label_path)) with open(label_path, 'r') as fr: label_id = 0 for line in fr.readlines(): cname2cid[line.strip()] = label_id label_id += 1 else: cname2cid = pascalvoc_label() with open(anno_path, 'r') as fr: while True: line = fr.readline() if not line: break img_file, xml_file = [os.path.join(image_dir, x) \ for x in line.strip().split()[:2]] if not os.path.exists(img_file): logger.warning( 'Illegal image file: {}, and it will be ignored'.format( img_file)) continue if not os.path.isfile(xml_file): logger.warning( 'Illegal xml file: {}, and it will be ignored'.format( xml_file)) continue tree = ET.parse(xml_file) if tree.find('id') is None: im_id = np.array([ct]) else: im_id = np.array([int(tree.find('id').text)]) objs = tree.findall('object') im_w = float(tree.find('size').find('width').text) im_h = float(tree.find('size').find('height').text) if im_w < 0 or im_h < 0: logger.warning( 'Illegal width: {} or height: {} in annotation, ' 'and {} will be ignored'.format(im_w, im_h, xml_file)) continue num_bbox, i = len(objs), 0 gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) gt_score = np.zeros((num_bbox, 1), dtype=np.float32) difficult = np.zeros((num_bbox, 1), dtype=np.int32) for obj in objs: cname = obj.find('name').text # user dataset may not contain difficult field _difficult = obj.find('difficult') _difficult = int( _difficult.text) if _difficult is not None else 0 x1 = float(obj.find('bndbox').find('xmin').text) y1 = float(obj.find('bndbox').find('ymin').text) x2 = float(obj.find('bndbox').find('xmax').text) y2 = float(obj.find('bndbox').find('ymax').text) x1 = max(0, x1) y1 = max(0, y1) x2 = min(im_w - 1, x2) y2 = min(im_h - 1, y2) if x2 > x1 and y2 > y1: gt_bbox[i, :] = [x1, y1, x2, y2] gt_class[i, 0] = cname2cid[cname] gt_score[i, 0] = 1. 
difficult[i, 0] = _difficult i += 1 else: logger.warning( 'Found an invalid bbox in annotations: xml_file: {}' ', x1: {}, y1: {}, x2: {}, y2: {}.'.format( xml_file, x1, y1, x2, y2)) gt_bbox = gt_bbox[:i, :] gt_class = gt_class[:i, :] gt_score = gt_score[:i, :] difficult = difficult[:i, :] voc_rec = { 'im_file': img_file, 'im_id': im_id, 'h': im_h, 'w': im_w } if 'image' in self.data_fields else {} gt_rec = { 'gt_class': gt_class, 'gt_score': gt_score, 'gt_bbox': gt_bbox, 'difficult': difficult } for k, v in gt_rec.items(): if k in self.data_fields: voc_rec[k] = v if len(objs) == 0: empty_records.append(voc_rec) else: records.append(voc_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any voc record in %s' % (self.anno_path) logger.debug('{} samples in file {}'.format(ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs, self.cname2cid = records, cname2cid def get_label_list(self): return os.path.join(self.dataset_dir, self.label_list) def pascalvoc_label(): labels_map = { 'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4, 'bus': 5, 'car': 6, 'cat': 7, 'chair': 8, 'cow': 9, 'diningtable': 10, 'dog': 11, 'horse': 12, 'motorbike': 13, 'person': 14, 'pottedplant': 15, 'sheep': 16, 'sofa': 17, 'train': 18, 'tvmonitor': 19 } return labels_map ================================================ FILE: ppdet/data/source/widerface.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections import defaultdict import os import numpy as np from scipy.io import loadmat from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class WIDERFaceDataSet(DetDataset): """ Load WiderFace records with 'anno_path' Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): WiderFace annotation data. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. with_lmk (bool): whether to load face landmark keypoint labels. 
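
        Example (editor's illustrative sketch; the paths follow the usual
        WIDER FACE layout and are assumptions):

            dataset = WIDERFaceDataSet(
                dataset_dir='dataset/wider_face',
                image_dir='WIDER_train/images',
                anno_path='wider_face_split/wider_face_train_bbx_gt.txt',
                data_fields=['image', 'gt_bbox', 'gt_class'])
            dataset.parse_dataset()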
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, with_lmk=False): super(WIDERFaceDataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, with_lmk=with_lmk) self.anno_path = anno_path self.sample_num = sample_num self.roidbs = None self.cname2cid = None self.with_lmk = with_lmk def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) txt_file = anno_path records = [] ct = 0 file_lists = self._load_file_list(txt_file) cname2cid = widerface_label() for item in file_lists: im_fname = item[0] im_id = np.array([ct]) gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32) gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32) gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32) lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32) for index_box in range(len(item)): if index_box < 1: continue gt_bbox[index_box - 1] = item[index_box][0] if self.with_lmk: gt_lmk_labels[index_box - 1] = item[index_box][1] lmk_ignore_flag[index_box - 1] = item[index_box][2] im_fname = os.path.join(image_dir, im_fname) if image_dir else im_fname widerface_rec = { 'im_file': im_fname, 'im_id': im_id, } if 'image' in self.data_fields else {} gt_rec = { 'gt_bbox': gt_bbox, 'gt_class': gt_class, } for k, v in gt_rec.items(): if k in self.data_fields: widerface_rec[k] = v if self.with_lmk: widerface_rec['gt_keypoint'] = gt_lmk_labels widerface_rec['keypoint_ignore'] = lmk_ignore_flag if len(item) != 0: records.append(widerface_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert len(records) > 0, 'not found any widerface in %s' % (anno_path) logger.debug('{} samples in file {}'.format(ct, anno_path)) self.roidbs, self.cname2cid = records, cname2cid def _load_file_list(self, input_txt): with open(input_txt, 'r') as f_dir: lines_input_txt = f_dir.readlines() file_dict = {} num_class = 0 exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for i in range(len(lines_input_txt)): line_txt = lines_input_txt[i].strip('\n\t\r') split_str = line_txt.split(' ') if len(split_str) == 1: img_file_name = os.path.split(split_str[0])[1] split_txt = img_file_name.split('.') if len(split_txt) < 2: continue elif split_txt[-1] in exts: if i != 0: num_class += 1 file_dict[num_class] = [line_txt] else: if len(line_txt) <= 6: continue result_boxs = [] xmin = float(split_str[0]) ymin = float(split_str[1]) w = float(split_str[2]) h = float(split_str[3]) # Filter out wrong labels if w < 0 or h < 0: logger.warning('Illegal box with w: {}, h: {} in ' 'img: {}, and it will be ignored'.format( w, h, file_dict[num_class][0])) continue xmin = max(0, xmin) ymin = max(0, ymin) xmax = xmin + w ymax = ymin + h gt_bbox = [xmin, ymin, xmax, ymax] result_boxs.append(gt_bbox) if self.with_lmk: assert len(split_str) > 18, 'When `with_lmk=True`, the number' \ 'of characters per line in the annotation file should' \ 'exceed 18.' 
                    lmk0_x = float(split_str[5])
                    lmk0_y = float(split_str[6])
                    lmk1_x = float(split_str[8])
                    lmk1_y = float(split_str[9])
                    lmk2_x = float(split_str[11])
                    lmk2_y = float(split_str[12])
                    lmk3_x = float(split_str[14])
                    lmk3_y = float(split_str[15])
                    lmk4_x = float(split_str[17])
                    lmk4_y = float(split_str[18])
                    lmk_ignore_flag = 0 if lmk0_x == -1 else 1
                    gt_lmk_label = [
                        lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y,
                        lmk3_x, lmk3_y, lmk4_x, lmk4_y
                    ]
                    result_boxs.append(gt_lmk_label)
                    result_boxs.append(lmk_ignore_flag)
                file_dict[num_class].append(result_boxs)

        return list(file_dict.values())


def widerface_label():
    labels_map = {'face': 0}
    return labels_map


@register
@serializable
class WIDERFaceValDataset(WIDERFaceDataSet):
    def __init__(self,
                 dataset_dir=None,
                 image_dir=None,
                 anno_path=None,
                 gt_mat_path=None,
                 data_fields=['image'],
                 sample_num=-1,
                 with_lmk=False):
        super().__init__(
            dataset_dir=dataset_dir,
            image_dir=image_dir,
            anno_path=anno_path,
            data_fields=data_fields,
            sample_num=sample_num,
            with_lmk=with_lmk)
        self.gt_mat_path = gt_mat_path
        self.val_mat = os.path.join(self.dataset_dir, self.gt_mat_path,
                                    'wider_face_val.mat')
        self.hard_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path,
                                          'wider_hard_val.mat')
        self.medium_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path,
                                            'wider_medium_val.mat')
        self.easy_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path,
                                          'wider_easy_val.mat')
        assert os.path.exists(self.val_mat), f'{self.val_mat} not exist'
        assert os.path.exists(
            self.hard_mat_path), f'{self.hard_mat_path} not exist'
        assert os.path.exists(
            self.medium_mat_path), f'{self.medium_mat_path} not exist'
        assert os.path.exists(
            self.easy_mat_path), f'{self.easy_mat_path} not exist'

    def parse_dataset(self):
        super().parse_dataset()
        box_list, file_list, event_list, hard_info_list, medium_info_list, \
            easy_info_list = self.get_gt_infos()
        setting_infos = [easy_info_list, medium_info_list, hard_info_list]
        settings = ['easy', 'medium', 'hard']
        info_by_name = defaultdict(dict)
        for setting_id in range(3):
            info_list = setting_infos[setting_id]
            setting = settings[setting_id]
            for i in range(len(event_list)):
                img_list = file_list[i][0]
                gt_box_list = box_list[i][0]
                sub_info_list = info_list[i][0]
                for j in range(len(img_list)):
                    img_name = str(img_list[j][0][0])
                    gt_boxes = gt_box_list[j][0].astype(np.float32)
                    info_by_name[img_name]['gt_ori_bbox'] = gt_boxes
                    keep_index = sub_info_list[j][0]
                    ignore = np.zeros(gt_boxes.shape[0])
                    if len(keep_index) != 0:
                        ignore[keep_index - 1] = 1
                    info_by_name[img_name][f'gt_{setting}_ignore'] = ignore

        for roidb in self.roidbs:
            img_file = roidb['im_file'].split('/')[-1]
            img_name = ".".join(img_file.split(".")[:-1])
            roidb.update(info_by_name[img_name])

    def get_gt_infos(self):
        """gt dir: (wider_face_val.mat, wider_easy_val.mat,
        wider_medium_val.mat, wider_hard_val.mat)"""
        val_mat = loadmat(self.val_mat)
        hard_mat = loadmat(self.hard_mat_path)
        medium_mat = loadmat(self.medium_mat_path)
        easy_mat = loadmat(self.easy_mat_path)

        box_list = val_mat['face_bbx_list']
        file_list = val_mat['file_list']
        event_list = val_mat['event_list']

        hard_info_list = hard_mat['gt_list']
        medium_info_list = medium_mat['gt_list']
        easy_info_list = easy_mat['gt_list']

        return box_list, file_list, event_list, hard_info_list, \
            medium_info_list, easy_info_list


================================================
FILE: ppdet/data/transform/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import operators
from . import batch_operators
from . import keypoint_operators
from . import mot_operators
from . import rotated_operators
from . import keypoints_3d_operators
from . import culane_operators

from .operators import *
from .batch_operators import *
from .keypoint_operators import *
from .mot_operators import *
from .rotated_operators import *
from .keypoints_3d_operators import *
from .culane_operators import *

__all__ = []
__all__ += registered_ops
__all__ += keypoint_operators.__all__
__all__ += mot_operators.__all__
__all__ += culane_operators.__all__


================================================
FILE: ppdet/data/transform/atss_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)


def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground), "giou" or "diou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(
        mode)
    # Either the boxes are empty or the length of the boxes' last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ...
Bn) assert bboxes1.shape[:-2] == bboxes2.shape[:-2] batch_shape = bboxes1.shape[:-2] rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 if is_aligned: assert rows == cols if rows * cols == 0: if is_aligned: return np.random.random(batch_shape + (rows, )) else: return np.random.random(batch_shape + (rows, cols)) area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( bboxes1[..., 3] - bboxes1[..., 1]) area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( bboxes2[..., 3] - bboxes2[..., 1]) if is_aligned: lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] wh = (rb - lt).clip(min=0) # [B, rows, 2] overlap = wh[..., 0] * wh[..., 1] if mode in ['iou', 'giou']: union = area1 + area2 - overlap else: union = area1 if mode == 'giou': enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) if mode == 'diou': enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1] b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3] b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1] b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3] else: lt = np.maximum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) # [B, rows, cols, 2] rb = np.minimum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] overlap = wh[..., 0] * wh[..., 1] if mode in ['iou', 'giou']: union = area1[..., None] + area2[..., None, :] - overlap else: union = area1[..., None] if mode == 'giou': enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) if mode == 'diou': enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1] b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3] b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1] b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3] eps = np.array([eps]) union = np.maximum(union, eps) ious = overlap / union if mode in ['iou', 'iof']: return ious # calculate gious if mode in ['giou']: enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] enclose_area = np.maximum(enclose_area, eps) gious = ious - (enclose_area - union) / enclose_area return gious if mode in ['diou']: left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 rho2 = left + right enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2 enclose_c = np.maximum(enclose_c, eps) dious = ious - rho2 / enclose_c return dious def topk_(input, k, axis=1, largest=True): x = -input if largest else input if axis == 0: row_index = np.arange(input.shape[1 - axis]) if k == x.shape[0]: # argpartition requires index < len(input) topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :] else: topk_index = np.argpartition(x, k, axis=axis)[0:k, :] topk_data = x[topk_index, row_index] topk_index_sort = np.argsort(topk_data, axis=axis) topk_data_sort = topk_data[topk_index_sort, row_index] topk_index_sort = topk_index[0:k, :][topk_index_sort, 
row_index]
    else:
        column_index = np.arange(x.shape[1 - axis])[:, None]
        topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]
        topk_data = x[column_index, topk_index]
        topk_data = -topk_data if largest else topk_data
        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[column_index, topk_index_sort]
        topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]
    return topk_data_sort, topk_index_sort


class ATSSAssigner(object):
    """Assign a corresponding gt bbox or background to each bbox.

    Each proposal will be assigned with `0` or a positive integer
    indicating the ground truth index.

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        topk (int): number of bboxes selected on each level
    """

    def __init__(self, topk=9):
        self.topk = topk

    def __call__(self,
                 bboxes,
                 num_level_bboxes,
                 gt_bboxes,
                 gt_bboxes_ignore=None,
                 gt_labels=None):
        """Assign gt to bboxes.

        The assignment is done in the following steps

        1. compute the iou between all bboxes (bboxes of all pyramid levels)
           and gt
        2. compute the center distance between all bboxes and gt
        3. on each pyramid level, for each gt, select k bboxes whose centers
           are closest to the gt center, so we select k*l bboxes in total as
           candidates for each gt
        4. get the corresponding iou for these candidates, compute their mean
           and std, and set mean + std as the iou threshold
        5. select candidates whose iou is greater than or equal to the
           threshold as positive
        6. limit the positive sample's center to lie inside the gt

        Args:
            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
            num_level_bboxes (List): num of bboxes in each level
            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
                labelled as `ignored`, e.g., crowd boxes in COCO.
            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
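
        Example (editor's illustrative sketch, not part of the original
        docstring):

            assigner = ATSSAssigner(topk=9)
            anchors = np.array([[0., 0., 8., 8.], [8., 8., 16., 16.],
                                [0., 0., 16., 16.], [16., 16., 32., 32.]])
            gts = np.array([[0., 0., 10., 10.]])
            assigned, max_ious = assigner(
                anchors, num_level_bboxes=[2, 2], gt_bboxes=gts)
            # assigned: (n,) int64 array, 0 = background, i + 1 = matched gt i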
""" bboxes = bboxes[:, :4] num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] # assign 0 by default assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) if num_gt == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment max_overlaps = np.zeros((num_bboxes, )) if num_gt == 0: # No truth, assign everything to background assigned_gt_inds[:] = 0 if not np.any(gt_labels): assigned_labels = None else: assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) return assigned_gt_inds, max_overlaps # compute iou between all bbox and gt overlaps = bbox_overlaps(bboxes, gt_bboxes) # compute center distance between all bbox and gt gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 gt_points = np.stack((gt_cx, gt_cy), axis=1) bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) distances = np.sqrt( np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) .sum(-1)) # Selecting candidates based on the center distance candidate_idxs = [] start_idx = 0 for bboxes_per_level in num_level_bboxes: # on each pyramid level, for each gt, # select k bbox whose center are closest to the gt center end_idx = start_idx + bboxes_per_level distances_per_level = distances[start_idx:end_idx, :] selectable_k = min(self.topk, bboxes_per_level) _, topk_idxs_per_level = topk_( distances_per_level, selectable_k, axis=0, largest=False) candidate_idxs.append(topk_idxs_per_level + start_idx) start_idx = end_idx candidate_idxs = np.concatenate(candidate_idxs, axis=0) # get corresponding iou for the these candidates, and compute the # mean and std, set mean + std as the iou threshold candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)] overlaps_mean_per_gt = candidate_overlaps.mean(0) overlaps_std_per_gt = candidate_overlaps.std(0) overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] # limit the positive sample's center in gt for gt_idx in range(num_gt): candidate_idxs[:, gt_idx] += gt_idx * num_bboxes ep_bboxes_cx = np.broadcast_to( bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) ep_bboxes_cy = np.broadcast_to( bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) candidate_idxs = candidate_idxs.reshape(-1) # calculate the left, top, right, bottom distance between positive # bbox center and gt side l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0] t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1] r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01 is_pos = is_pos & is_in_gts # if an anchor box is assigned to multiple gts, # the one with the highest IoU will be selected. overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] overlaps_inf[index] = overlaps.T.reshape(-1)[index] overlaps_inf = overlaps_inf.reshape(num_gt, -1).T max_overlaps = overlaps_inf.max(axis=1) argmax_overlaps = overlaps_inf.argmax(axis=1) assigned_gt_inds[max_overlaps != -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 return assigned_gt_inds, max_overlaps def get_vlr_region(self, bboxes, num_level_bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): """get vlr region for ld distillation. 
Args: bboxes (np.array): Bounding boxes to be assigned, shape(n, 4). num_level_bboxes (List): num of bboxes in each level gt_bboxes (np.array): Groundtruth boxes, shape (k, 4). gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are labelled as `ignored`, e.g., crowd boxes in COCO. gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ). """ bboxes = bboxes[:, :4] num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] # compute iou between all bbox and gt overlaps = bbox_overlaps(bboxes, gt_bboxes) # compute diou between all bbox and gt diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou') # assign 0 by default assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32) if num_gt == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment max_overlaps = np.zeros((num_bboxes, )) if num_gt == 0: # No truth, assign everything to background assigned_gt_inds[:] = 0 if not np.any(gt_labels): assigned_labels = None else: assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) return assigned_gt_inds, max_overlaps # compute center distance between all bbox and gt gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 gt_points = np.stack((gt_cx, gt_cy), axis=1) bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) distances = np.sqrt( np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) .sum(-1)) # Selecting candidates based on the center distance candidate_idxs = [] candidate_idxs_t = [] start_idx = 0 for bboxes_per_level in num_level_bboxes: # on each pyramid level, for each gt, # select k bbox whose center are closest to the gt center end_idx = start_idx + bboxes_per_level distances_per_level = distances[start_idx:end_idx, :] selectable_t = min(self.topk, bboxes_per_level) selectable_k = bboxes_per_level #k for all _, topt_idxs_per_level = topk_( distances_per_level, selectable_t, axis=0, largest=False) _, topk_idxs_per_level = topk_( distances_per_level, selectable_k, axis=0, largest=False) candidate_idxs_t.append(topt_idxs_per_level + start_idx) candidate_idxs.append(topk_idxs_per_level + start_idx) start_idx = end_idx candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0) candidate_idxs = np.concatenate(candidate_idxs, axis=0) # get corresponding iou for the these candidates, and compute the # mean and std, set mean + std as the iou threshold candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)] # compute tdiou t_diou = diou[candidate_idxs, np.arange(num_gt)] overlaps_mean_per_gt = candidate_overlaps_t.mean(0) overlaps_std_per_gt = candidate_overlaps_t.std( 0, ddof=1) # NOTE: use Bessel correction overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt # compute region is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & ( t_diou >= 0.25 * overlaps_thr_per_gt[None, :]) # limit the positive sample's center in gt for gt_idx in range(num_gt): candidate_idxs[:, gt_idx] += gt_idx * num_bboxes candidate_idxs = candidate_idxs.reshape(-1) # if an anchor box is assigned to multiple gts, # the one with the highest IoU will be selected. 
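        # (Illustration, editor's note: if an anchor is a candidate for gt 1
        # with IoU 0.52 and for gt 3 with IoU 0.61, it ends up assigned to
        # gt 3, since only the entry with the maximum IoU survives below.)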
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] overlaps_inf[index] = overlaps.T.reshape(-1)[index] overlaps_inf = overlaps_inf.reshape(num_gt, -1).T max_overlaps = overlaps_inf.max(axis=1) argmax_overlaps = overlaps_inf.argmax(axis=1) overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) overlaps_inf = overlaps_inf.reshape(num_gt, -1).T assigned_gt_inds[max_overlaps != -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 vlr_region_iou[max_overlaps != -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0 return vlr_region_iou ================================================ FILE: ppdet/data/transform/autoaugment_utils.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Reference: # https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py """AutoAugment util file.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import inspect import math from PIL import Image, ImageEnhance import numpy as np import cv2 from copy import deepcopy # This signifies the max integer that the controller RNN could predict for the # augmentation scheme. _MAX_LEVEL = 10. # Represents an invalid bounding box that is used for checking for padding # lists of bounding box coordinates for a few augmentation operations _INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] def policy_v0(): """Autoaugment policy that was used in AutoAugment Detection Paper.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. policy = [ [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], ] return policy def policy_v1(): """Autoaugment policy that was used in AutoAugment Detection Paper.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. 
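    # (Editor's illustrative note: in the first sub-policy below,
    # ('TranslateX_BBox', 0.6, 4) applies TranslateX_BBox with probability 0.6
    # at magnitude 4 on the 0-10 scale set by _MAX_LEVEL, and
    # ('Equalize', 0.8, 10) then runs on the result with probability 0.8.)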
policy = [ [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)], [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)], [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)], [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)], [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)], # , [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)], [('Color', 1.0, 6), ('Equalize', 1.0, 2)], [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)], [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)], [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)], [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)], [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)], [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)], [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)], [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)], ] return policy def policy_vtest(): """Autoaugment test policy for debugging.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ] return policy def policy_v2(): """Additional policy that performs well on object detection.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. policy = [ [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2), ('Rotate_BBox', 0.8, 10)], [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)], [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8), ('Brightness', 0.0, 10)], [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10), ('AutoContrast', 0.6, 0)], [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)], [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8), ('Solarize', 0.0, 10)], [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8), ('Rotate_BBox', 0.8, 8)], [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)], [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6), ('Rotate_BBox', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)], [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6), ('ShearY_BBox', 0.6, 8)], [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2), ('Brightness', 0.2, 2)], [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6), ('SolarizeAdd', 0.2, 10)], [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)], ] return policy def policy_v3(): """"Additional policy that performs well on object detection.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. 
policy = [ [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)], [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)], [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)], [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)], [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)], [('Sharpness', 0.0, 2), ('Color', 0.4, 8)], [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)], [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)], [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)], [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)], [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)], [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)], [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)], [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)], [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)], ] return policy def _equal(val1, val2, eps=1e-8): return abs(val1 - val2) <= eps def blend(image1, image2, factor): """Blend image1 and image2 using 'factor'. Factor can be above 0.0. A value of 0.0 means only image1 is used. A value of 1.0 means only image2 is used. A value between 0.0 and 1.0 means we linearly interpolate the pixel values between the two images. A value greater than 1.0 "extrapolates" the difference between the two pixel values, and we clip the results to values between 0 and 255. Args: image1: An image Tensor of type uint8. image2: An image Tensor of type uint8. factor: A floating point value above 0.0. Returns: A blended image Tensor of type uint8. """ if factor == 0.0: return image1 if factor == 1.0: return image2 image1 = image1.astype(np.float32) image2 = image2.astype(np.float32) difference = image2 - image1 scaled = factor * difference # Do addition in float. temp = image1 + scaled # Interpolate if factor > 0.0 and factor < 1.0: # Interpolation means we always stay within 0 and 255. return temp.astype(np.uint8) # Extrapolate: # # We need to clip and then cast. return np.clip(temp, a_min=0, a_max=255).astype(np.uint8) def cutout(image, pad_size, replace=0): """Apply cutout (https://arxiv.org/abs/1708.04552) to image. This operation applies a (2*pad_size x 2*pad_size) mask of zeros to a random location within `img`. The pixel values filled in will be of the value `replace`. The located where the mask will be applied is randomly chosen uniformly over the whole image. Args: image: An image Tensor of type uint8. pad_size: Specifies how big the zero mask that will be generated is that is applied to the image. The mask will be of size (2*pad_size x 2*pad_size). replace: What pixel value to fill in the image in the area that has the cutout mask applied to it. Returns: An image Tensor that is of type uint8. 
Example: img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB ) new_img = cutout(img, pad_size=50, replace=0) """ image_height, image_width = image.shape[0], image.shape[1] cutout_center_height = np.random.randint(low=0, high=image_height) cutout_center_width = np.random.randint(low=0, high=image_width) lower_pad = np.maximum(0, cutout_center_height - pad_size) upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) left_pad = np.maximum(0, cutout_center_width - pad_size) right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) cutout_shape = [ image_height - (lower_pad + upper_pad), image_width - (left_pad + right_pad) ] padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] mask = np.pad(np.zeros( cutout_shape, dtype=image.dtype), padding_dims, 'constant', constant_values=1) mask = np.expand_dims(mask, -1) mask = np.tile(mask, [1, 1, 3]) image = np.where( np.equal(mask, 0), np.ones_like( image, dtype=image.dtype) * replace, image) return image.astype(np.uint8) def solarize(image, threshold=128): # For each pixel in the image, select the pixel # if the value is less than the threshold. # Otherwise, subtract 255 from the pixel. return np.where(image < threshold, image, 255 - image) def solarize_add(image, addition=0, threshold=128): # For each pixel in the image less than threshold # we add 'addition' amount to it and then clip the # pixel value to be between 0 and 255. The value # of 'addition' is between -128 and 128. added_image = image.astype(np.int64) + addition added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8) return np.where(image < threshold, added_image, image) def color(image, factor): """use cv2 to deal""" gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) return blend(degenerate, image, factor) # refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197 def contrast(img, factor): img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor) return np.array(img) def brightness(image, factor): """Equivalent of PIL Brightness.""" degenerate = np.zeros_like(image) return blend(degenerate, image, factor) def posterize(image, bits): """Equivalent of PIL Posterize.""" shift = 8 - bits return np.left_shift(np.right_shift(image, shift), shift) def rotate(image, degrees, replace): """Rotates the image by degrees either clockwise or counterclockwise. Args: image: An image Tensor of type uint8. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. replace: A one or three value 1D tensor to fill empty pixels caused by the rotate operation. Returns: The rotated version of image. """ image = wrap(image) image = Image.fromarray(image) image = image.rotate(degrees) image = np.array(image, dtype=np.uint8) return unwrap(image, replace) def random_shift_bbox(image, bbox, pixel_scaling, replace, new_min_bbox_coords=None): """Move the bbox and the image content to a slightly new random location. Args: image: 3D uint8 Tensor. bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. The potential values for the new min corner of the bbox will be between [old_min - pixel_scaling * bbox_height/2, old_min - pixel_scaling * bbox_height/2]. 
pixel_scaling: A float between 0 and 1 that specifies the pixel range that the new bbox location will be sampled from. replace: A one or three value 1D tensor to fill empty pixels. new_min_bbox_coords: If not None, then this is a tuple that specifies the (min_y, min_x) coordinates of the new bbox. Normally this is randomly specified, but this allows it to be manually set. The coordinates are the absolute coordinates between 0 and image height/width and are int32. Returns: The new image that will have the shifted bbox location in it along with the new bbox that contains the new coordinates. """ # Obtains image height and width and create helper clip functions. image_height, image_width = image.shape[0], image.shape[1] image_height = float(image_height) image_width = float(image_width) def clip_y(val): return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32) def clip_x(val): return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32) # Convert bbox to pixel coordinates. min_y = int(image_height * bbox[0]) min_x = int(image_width * bbox[1]) max_y = clip_y(image_height * bbox[2]) max_x = clip_x(image_width * bbox[3]) bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1) image_height = int(image_height) image_width = int(image_width) # Select the new min/max bbox ranges that are used for sampling the # new min x/y coordinates of the shifted bbox. minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) / 2.0)) maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) / 2.0)) minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0)) maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0)) # Sample and calculate the new unclipped min/max coordinates of the new bbox. if new_min_bbox_coords is None: unclipped_new_min_y = np.random.randint( low=minval_y, high=maxval_y, dtype=np.int32) unclipped_new_min_x = np.random.randint( low=minval_x, high=maxval_x, dtype=np.int32) else: unclipped_new_min_y, unclipped_new_min_x = ( clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1])) unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1 unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1 # Determine if any of the new bbox was shifted outside the current image. # This is used for determining if any of the original bbox content should be # discarded. new_min_y, new_min_x, new_max_y, new_max_x = ( clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x), clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x)) shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y shifted_max_y = max_y - (unclipped_new_max_y - new_max_y) shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x shifted_max_x = max_x - (unclipped_new_max_x - new_max_x) # Create the new bbox tensor by converting pixel integer values to floats. new_bbox = np.stack([ float(new_min_y) / float(image_height), float(new_min_x) / float(image_width), float(new_max_y) / float(image_height), float(new_max_x) / float(image_width) ]) # Copy the contents in the bbox and fill the old bbox location # with gray (128). 
bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x: shifted_max_x + 1, :] def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor, image_): """Applies mask to bbox region in image then adds content_tensor to it.""" mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_], [min_x_, (image_width - 1) - max_x_], [0, 0]], 'constant', constant_values=1) content_tensor = np.pad(content_tensor, [[min_y_, (image_height - 1) - max_y_], [min_x_, (image_width - 1) - max_x_], [0, 0]], 'constant', constant_values=0) return image_ * mask + content_tensor # Zero out original bbox location. mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :] grey_tensor = np.zeros_like(mask) + replace[0] image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor, image) # Fill in bbox content to new bbox location. mask = np.zeros_like(bbox_content) image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask, bbox_content, image) return image.astype(np.uint8), new_bbox def _clip_bbox(min_y, min_x, max_y, max_x): """Clip bounding box coordinates between 0 and 1. Args: min_y: Normalized bbox coordinate of type float between 0 and 1. min_x: Normalized bbox coordinate of type float between 0 and 1. max_y: Normalized bbox coordinate of type float between 0 and 1. max_x: Normalized bbox coordinate of type float between 0 and 1. Returns: Clipped coordinate values between 0 and 1. """ min_y = np.clip(min_y, a_min=0, a_max=1.0) min_x = np.clip(min_x, a_min=0, a_max=1.0) max_y = np.clip(max_y, a_min=0, a_max=1.0) max_x = np.clip(max_x, a_min=0, a_max=1.0) return min_y, min_x, max_y, max_x def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): """Adjusts bbox coordinates to make sure the area is > 0. Args: min_y: Normalized bbox coordinate of type float between 0 and 1. min_x: Normalized bbox coordinate of type float between 0 and 1. max_y: Normalized bbox coordinate of type float between 0 and 1. max_x: Normalized bbox coordinate of type float between 0 and 1. delta: Float, this is used to create a gap of size 2 * delta between bbox min/max coordinates that are the same on the boundary. This prevents the bbox from having an area of zero. Returns: Tuple of new bbox coordinates between 0 and 1 that will now have a guaranteed area > 0. """ height = max_y - min_y width = max_x - min_x def _adjust_bbox_boundaries(min_coord, max_coord): # Make sure max is never 0 and min is never 1. max_coord = np.maximum(max_coord, 0.0 + delta) min_coord = np.minimum(min_coord, 1.0 - delta) return min_coord, max_coord if _equal(height, 0): min_y, max_y = _adjust_bbox_boundaries(min_y, max_y) if _equal(width, 0): min_x, max_x = _adjust_bbox_boundaries(min_x, max_x) return min_y, min_x, max_y, max_x def _scale_bbox_only_op_probability(prob): """Reduce the probability of the bbox-only operation. Probability is reduced so that we do not distort the content of too many bounding boxes that are close to each other. The value of 3.0 was a chosen hyper parameter when designing the autoaugment algorithm that we found empirically to work well. Args: prob: Float that is the probability of applying the bbox-only operation. Returns: Reduced probability. """ return prob / 3.0 def _apply_bbox_augmentation(image, bbox, augmentation_func, *args): """Applies augmentation_func to the subsection of image indicated by bbox. Args: image: 3D uint8 Tensor. 
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
        of type float that represents the normalized coordinates between
        0 and 1.
    augmentation_func: Augmentation function that will be applied to the
        subsection of image.
    *args: Additional parameters that will be passed into augmentation_func
        when it is called.

    Returns:
        A modified version of image, where the bbox location in the image will
        have `augmentation_func` applied to it.
    """
    image_height = image.shape[0]
    image_width = image.shape[1]
    min_y = int(image_height * bbox[0])
    min_x = int(image_width * bbox[1])
    max_y = int(image_height * bbox[2])
    max_x = int(image_width * bbox[3])

    # Clip to be sure the max values do not fall out of range.
    max_y = np.minimum(max_y, image_height - 1)
    max_x = np.minimum(max_x, image_width - 1)

    # Get the sub-tensor that is the image within the bounding box region.
    bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]

    # Apply the augmentation function to the bbox portion of the image.
    augmented_bbox_content = augmentation_func(bbox_content, *args)

    # Pad the augmented_bbox_content and the mask to match the shape of
    # the original image.
    augmented_bbox_content = np.pad(
        augmented_bbox_content, [[min_y, (image_height - 1) - max_y],
                                 [min_x, (image_width - 1) - max_x], [0, 0]],
        'constant',
        constant_values=1)

    # Create a mask that will be used to zero out a part of the original image.
    mask_tensor = np.zeros_like(bbox_content)
    mask_tensor = np.pad(mask_tensor,
                         [[min_y, (image_height - 1) - max_y],
                          [min_x, (image_width - 1) - max_x], [0, 0]],
                         'constant',
                         constant_values=1)
    # Replace the old bbox content with the new augmented content.
    image = image * mask_tensor + augmented_bbox_content
    return image.astype(np.uint8)


def _concat_bbox(bbox, bboxes):
    """Helper function that concatenates bbox to bboxes along the first dimension."""
    # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
    # we discard bboxes and start the bboxes Tensor with the current bbox.
    bboxes_sum_check = np.sum(bboxes)
    bbox = np.expand_dims(bbox, 0)
    # This check will be true when it is an _INVALID_BOX
    if _equal(bboxes_sum_check, -4):
        bboxes = bbox
    else:
        bboxes = np.concatenate([bboxes, bbox], 0)
    return bboxes


def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
                                     augmentation_func, func_changes_bbox,
                                     *args):
    """Applies _apply_bbox_augmentation with probability prob.

    Args:
        image: 3D uint8 Tensor.
        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
            of type float that represents the normalized coordinates between
            0 and 1.
        new_bboxes: 2D Tensor that is a list of the bboxes in the image after
            they have been altered by aug_func. These will only be changed
            when func_changes_bbox is set to true. Each bbox has 4 elements
            (min_y, min_x, max_y, max_x) of type float that are the normalized
            bbox coordinates between 0 and 1.
        prob: Float that is the probability of applying
            _apply_bbox_augmentation.
        augmentation_func: Augmentation function that will be applied to the
            subsection of image.
        func_changes_bbox: Boolean. Does augmentation_func return bbox in
            addition to image.
        *args: Additional parameters that will be passed into
            augmentation_func when it is called.

    Returns:
        A tuple. First element is a modified version of image, where the bbox
        location in the image will have augmentation_func applied to it if it
        is chosen to be called with probability `prob`. The second element is
        a Tensor of Tensors of length 4 that will contain the altered bbox
        after applying augmentation_func.
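
    Note (editor's addition): the gate used below,
    `np.random.rand() + prob >= 1`, fires with probability exactly `prob`,
    since P(U >= 1 - prob) = prob for U ~ Uniform[0, 1).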
""" should_apply_op = (np.random.rand() + prob >= 1) if func_changes_bbox: if should_apply_op: augmented_image, bbox = augmentation_func(image, bbox, *args) else: augmented_image, bbox = (image, bbox) else: if should_apply_op: augmented_image = _apply_bbox_augmentation(image, bbox, augmentation_func, *args) else: augmented_image = image new_bboxes = _concat_bbox(bbox, new_bboxes) return augmented_image.astype(np.uint8), new_bboxes def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func, func_changes_bbox, *args): """Applies aug_func to the image for each bbox in bboxes. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float. prob: Float that is the probability of applying aug_func to a specific bounding box within the image. aug_func: Augmentation function that will be applied to the subsections of image indicated by the bbox values in bboxes. func_changes_bbox: Boolean. Does augmentation_func return bbox in addition to image. *args: Additional parameters that will be passed into augmentation_func when it is called. Returns: A modified version of image, where each bbox location in the image will have augmentation_func applied to it if it is chosen to be called with probability prob independently across all bboxes. Also the final bboxes are returned that will be unchanged if func_changes_bbox is set to false and if true, the new altered ones will be returned. """ # Will keep track of the new altered bboxes after aug_func is repeatedly # applied. The -1 values are a dummy value and this first Tensor will be # removed upon appending the first real bbox. new_bboxes = np.array(_INVALID_BOX) # If the bboxes are empty, then just give it _INVALID_BOX. The result # will be thrown away. bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!" # pylint:disable=g-long-lambda # pylint:disable=line-too-long wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args) # pylint:enable=g-long-lambda # pylint:enable=line-too-long # Setup the while_loop. num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes. idx = 0 # Counter for the while loop. # Conditional function when to end the loop once we go over all bboxes # images_and_bboxes contain (_image, _new_bboxes) def cond(_idx, _images_and_bboxes): return _idx < num_bboxes # Shuffle the bboxes so that the augmentation order is not deterministic if # we are not changing the bboxes with aug_func. # if not func_changes_bbox: # print(bboxes) # loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0) # print(loop_bboxes) # else: # loop_bboxes = bboxes # we can not shuffle the bbox because it does not contain class information here loop_bboxes = deepcopy(bboxes) # Main function of while_loop where we repeatedly apply augmentation on the # bboxes in the image. # pylint:disable=g-long-lambda body = lambda _idx, _images_and_bboxes: [ _idx + 1, wrapped_aug_func(_images_and_bboxes[0], loop_bboxes[_idx], _images_and_bboxes[1])] while (cond(idx, (image, new_bboxes))): idx, (image, new_bboxes) = body(idx, (image, new_bboxes)) # Either return the altered bboxes or the original ones depending on if # we altered them in anyway. 
if func_changes_bbox: final_bboxes = new_bboxes else: final_bboxes = bboxes return image, final_bboxes def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, func_changes_bbox, *args): """Checks to be sure num bboxes > 0 before calling inner function.""" num_bboxes = len(bboxes) new_image = deepcopy(image) new_bboxes = deepcopy(bboxes) if num_bboxes != 0: new_image, new_bboxes = _apply_multi_bbox_augmentation( new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args) return new_image, new_bboxes def rotate_only_bboxes(image, bboxes, prob, degrees, replace): """Apply rotate to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, rotate, func_changes_bbox, degrees, replace) def shear_x_only_bboxes(image, bboxes, prob, level, replace): """Apply shear_x to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, shear_x, func_changes_bbox, level, replace) def shear_y_only_bboxes(image, bboxes, prob, level, replace): """Apply shear_y to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, shear_y, func_changes_bbox, level, replace) def translate_x_only_bboxes(image, bboxes, prob, pixels, replace): """Apply translate_x to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace) def translate_y_only_bboxes(image, bboxes, prob, pixels, replace): """Apply translate_y to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) def flip_only_bboxes(image, bboxes, prob): """Apply flip_lr to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, np.fliplr, func_changes_bbox) def solarize_only_bboxes(image, bboxes, prob, threshold): """Apply solarize to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize, func_changes_bbox, threshold) def equalize_only_bboxes(image, bboxes, prob): """Apply equalize to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize, func_changes_bbox) def cutout_only_bboxes(image, bboxes, prob, pad_size, replace): """Apply cutout to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace) def _rotate_bbox(bbox, image_height, image_width, degrees): """Rotates the bbox coordinates by degrees.
Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. Returns: A tensor of the same shape as bbox, but now with the rotated coordinates. """ image_height, image_width = (float(image_height), float(image_width)) # Convert from degrees to radians. degrees_to_radians = math.pi / 180.0 radians = degrees * degrees_to_radians # Translate the bbox to the center of the image and turn the normalized 0-1 # coordinates to absolute pixel locations. # Y coordinates are made negative as the y axis of images goes down with # increasing pixel values, so we negate to make sure x axis and y axis points # are in the traditionally positive direction. min_y = -int(image_height * (bbox[0] - 0.5)) min_x = int(image_width * (bbox[1] - 0.5)) max_y = -int(image_height * (bbox[2] - 0.5)) max_x = int(image_width * (bbox[3] - 0.5)) coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]).astype(np.float32) # Rotate the coordinates according to the rotation matrix clockwise if # radians is positive, else negative rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)], [-math.sin(radians), math.cos(radians)]]) new_coords = np.matmul(rotation_matrix, np.transpose(coordinates)).astype(np.int32) # Find min/max values and convert them back to normalized 0-1 floats. min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5) min_x = float(np.min(new_coords[1, :])) / image_width + 0.5 max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5) max_x = float(np.max(new_coords[1, :])) / image_width + 0.5 # Clip the bboxes to be sure they fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return np.stack([min_y, min_x, max_y, max_x]) def rotate_with_bboxes(image, bboxes, degrees, replace): """Rotates the image by degrees and maps each bbox through the same rotation.""" # Rotate the image. image = rotate(image, degrees, replace) # Convert bbox coordinates to pixel values. image_height, image_width = image.shape[:2] # pylint:disable=g-long-lambda wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees) # pylint:enable=g-long-lambda new_bboxes = np.zeros_like(bboxes) for idx in range(len(bboxes)): new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx]) return image, new_bboxes def translate_x(image, pixels, replace): """Equivalent of PIL Translate in X dimension.""" image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0)) return unwrap(np.array(image), replace) def translate_y(image, pixels, replace): """Equivalent of PIL Translate in Y dimension.""" image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels)) return unwrap(np.array(image), replace) def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal): """Shifts the bbox coordinates by pixels. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. pixels: An int. How many pixels to shift the bbox. shift_horizontal: Boolean.
If true then shift in X dimension else shift in Y dimension. Returns: A tensor of the same shape as bbox, but now with the shifted coordinates. """ pixels = int(pixels) # Convert bbox to integer pixel locations. min_y = int(float(image_height) * bbox[0]) min_x = int(float(image_width) * bbox[1]) max_y = int(float(image_height) * bbox[2]) max_x = int(float(image_width) * bbox[3]) if shift_horizontal: min_x = np.maximum(0, min_x - pixels) max_x = np.minimum(image_width, max_x - pixels) else: min_y = np.maximum(0, min_y - pixels) max_y = np.minimum(image_height, max_y - pixels) # Convert bbox back to floats. min_y = float(min_y) / float(image_height) min_x = float(min_x) / float(image_width) max_y = float(max_y) / float(image_height) max_x = float(max_x) / float(image_width) # Clip the bboxes to be sure they fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return np.stack([min_y, min_x, max_y, max_x]) def translate_bbox(image, bboxes, pixels, replace, shift_horizontal): """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. pixels: An int. How many pixels to shift the image and bboxes. replace: A one or three value 1D tensor to fill empty pixels. shift_horizontal: Boolean. If true then shift in X dimension else shift in Y dimension. Returns: A tuple containing a 3D uint8 Tensor that will be the result of translating image by pixels. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the shifted image. """ if shift_horizontal: image = translate_x(image, pixels, replace) else: image = translate_y(image, pixels, replace) # Convert bbox coordinates to pixel values. image_height, image_width = image.shape[0], image.shape[1] # pylint:disable=g-long-lambda wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal) # pylint:enable=g-long-lambda new_bboxes = deepcopy(bboxes) num_bboxes = len(bboxes) for idx in range(num_bboxes): new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx]) return image.astype(np.uint8), new_bboxes def shear_x(image, level, replace): """Equivalent of PIL Shearing in X dimension.""" # Shear parallel to x axis is a projective transform # with a matrix form of: # [1 level # 0 1]. image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0)) return unwrap(np.array(image), replace) def shear_y(image, level, replace): """Equivalent of PIL Shearing in Y dimension.""" # Shear parallel to y axis is a projective transform # with a matrix form of: # [1 0 # level 1]. image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0)) return unwrap(np.array(image), replace) def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal): """Shifts the bbox according to how the image was sheared. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. level: Float. How much to shear the image. shear_horizontal: If true then shear in X dimension else shear in the Y dimension.
Returns: A tensor of the same shape as bbox, but now with the shifted coordinates. """ image_height, image_width = (float(image_height), float(image_width)) # Change bbox coordinates to be pixels. min_y = int(image_height * bbox[0]) min_x = int(image_width * bbox[1]) max_y = int(image_height * bbox[2]) max_x = int(image_width * bbox[3]) coordinates = np.stack( [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) coordinates = coordinates.astype(np.float32) # Shear the coordinates according to the translation matrix. if shear_horizontal: translation_matrix = np.stack([[1, 0], [-level, 1]]) else: translation_matrix = np.stack([[1, -level], [0, 1]]) translation_matrix = translation_matrix.astype(np.float32) new_coords = np.matmul(translation_matrix, np.transpose(coordinates)).astype(np.int32) # Find min/max values and convert them back to floats. min_y = float(np.min(new_coords[0, :])) / image_height min_x = float(np.min(new_coords[1, :])) / image_width max_y = float(np.max(new_coords[0, :])) / image_height max_x = float(np.max(new_coords[1, :])) / image_width # Clip the bboxes to be sure they fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return np.stack([min_y, min_x, max_y, max_x]) def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal): """Applies Shear Transformation to the image and shifts the bboxes. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. level: Float. How much to shear the image. This value will be between -0.3 and 0.3. replace: A one or three value 1D tensor to fill empty pixels. shear_horizontal: Boolean. If true then shear in X dimension else shear in the Y dimension. Returns: A tuple containing a 3D uint8 Tensor that will be the result of shearing image by level. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the sheared image. """ if shear_horizontal: image = shear_x(image, level, replace) else: image = shear_y(image, level, replace) # Convert bbox coordinates to pixel values. image_height, image_width = image.shape[:2] # pylint:disable=g-long-lambda wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal) # pylint:enable=g-long-lambda new_bboxes = deepcopy(bboxes) num_bboxes = len(bboxes) for idx in range(num_bboxes): new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx]) return image.astype(np.uint8), new_bboxes def autocontrast(image): """Implements Autocontrast function from PIL. Args: image: A 3D uint8 tensor. Returns: The image after it has had autocontrast applied to it and will be of type uint8. """ def scale_channel(image): """Scale the 2D image using the autocontrast rule.""" # A possibly cheaper version can be done using cumsum/unique_with_counts # over the histogram values, rather than iterating over the entire image # to compute mins and maxes. lo = float(np.min(image)) hi = float(np.max(image)) # Scale the image, making the lowest value 0 and the highest value 255. def scale_values(im): scale = 255.0 / (hi - lo) offset = -lo * scale im = im.astype(np.float32) * scale + offset # Clip and return the rescaled values (the original code discarded the # clipped array and returned the unclipped one). im = np.clip(im, a_min=0, a_max=255.0) return im.astype(np.uint8) result = scale_values(image) if hi > lo else image return result # Assumes RGB for now. Scales each channel independently # and then stacks the result.
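# NOTE (illustrative aside, not part of the original source): scale_channel
# applies the affine map out = (in - lo) * 255 / (hi - lo), i.e. in * scale +
# offset with scale = 255 / (hi - lo) and offset = -lo * scale, so the darkest
# pixel maps to 0 and the brightest to 255. A tiny worked example (helper name
# is ours):
def _sketch_autocontrast_map():
    channel = np.array([[50, 100], [150, 200]], dtype=np.uint8)
    lo, hi = float(channel.min()), float(channel.max())
    scale = 255.0 / (hi - lo)
    offset = -lo * scale
    # Maps 50 -> 0, 100 -> 85, 150 -> 170, 200 -> 255.
    return np.clip(channel.astype(np.float32) * scale + offset, 0,
                   255).astype(np.uint8)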
s1 = scale_channel(image[:, :, 0]) s2 = scale_channel(image[:, :, 1]) s3 = scale_channel(image[:, :, 2]) image = np.stack([s1, s2, s3], 2) return image def sharpness(image, factor): """Implements Sharpness function from PIL.""" orig_image = image image = image.astype(np.float32) # Smooth the image with the PIL SMOOTH kernel; cv2.filter2D operates on the # HWC array directly. kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13. result = cv2.filter2D(image, -1, kernel).astype(np.uint8) # Blend the final result. return blend(result, orig_image, factor) def equalize(image): """Implements Equalize function from PIL using numpy.""" def scale_channel(im, c): """Scale the data in the channel to implement equalize.""" im = im[:, :, c].astype(np.int32) # Compute the histogram of the image channel. histo, _ = np.histogram(im, range=[0, 255], bins=256) # For the purposes of computing the step, keep only the nonzero counts. nonzero = np.where(np.not_equal(histo, 0)) nonzero_histo = np.reshape(np.take(histo, nonzero), [-1]) step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 def build_lut(histo, step): # Compute the cumulative sum, shifting by step // 2 # and then normalization by step. lut = (np.cumsum(histo) + (step // 2)) // step # Shift lut, prepending with 0. lut = np.concatenate([[0], lut[:-1]], 0) # Clip the counts to be in range. This is done # in the C code for image.point. return np.clip(lut, a_min=0, a_max=255).astype(np.uint8) # If step is zero, return the original image. Otherwise, build # lut from the full histogram and step and then index from it. if step == 0: result = im else: result = np.take(build_lut(histo, step), im) return result.astype(np.uint8) # Assumes RGB for now. Scales each channel independently # and then stacks the result. s1 = scale_channel(image, 0) s2 = scale_channel(image, 1) s3 = scale_channel(image, 2) image = np.stack([s1, s2, s3], 2) return image def wrap(image): """Returns 'image' with an extra channel set to all 255s.""" shape = image.shape extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype) extended = np.concatenate([image, extended_channel], 2).astype(image.dtype) return extended def unwrap(image, replace): """Unwraps an image produced by wrap. Where there is a 0 in the last channel for every spatial position, the rest of the three channels in that spatial dimension are filled with `replace` (128 by convention, a grey value). Operations like translate and shear on a wrapped Tensor will leave 0s in empty locations. Some transformations look at the intensity of values to do preprocessing, and we want these empty pixels to assume the 'average' value, rather than pure black. Args: image: A 3D Image Tensor with 4 channels. replace: A one or three value 1D tensor to fill empty pixels. Returns: image: A 3D image Tensor with 3 channels. """ image_shape = image.shape # Flatten the spatial dimensions. flattened_image = np.reshape(image, [-1, image_shape[2]]) # Find all pixels where the last channel is zero. alpha_channel = flattened_image[:, 3] replace = np.concatenate([replace, np.ones([1], image.dtype)], 0) # Where they are zero, fill them in with 'replace'.
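# NOTE (illustrative aside, not part of the original source): wrap()/unwrap()
# use the appended fourth channel as a validity mask. The geometric ops fill
# vacated pixels with 0 in every channel, so alpha == 0 marks pixels that were
# introduced by the transform, and unwrap() rewrites exactly those pixels with
# `replace`. A sketch of the round trip (helper name is ours):
def _sketch_wrap_unwrap_roundtrip():
    image = np.full((4, 4, 3), 200, dtype=np.uint8)
    wrapped = wrap(image)   # shape (4, 4, 4), alpha channel all 255
    wrapped[0, :, :] = 0    # pretend a shift vacated the first row
    restored = unwrap(wrapped, np.array([128, 128, 128], dtype=np.uint8))
    return restored         # row 0 becomes grey (128); the rest stays 200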
alpha_channel = np.reshape(alpha_channel, (-1, 1)) alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1])) flattened_image = np.where( np.equal(alpha_channel, 0), np.ones_like( flattened_image, dtype=image.dtype) * replace, flattened_image) image = np.reshape(flattened_image, image_shape) image = image[:, :, :3] return image.astype(np.uint8) def _cutout_inside_bbox(image, bbox, pad_fraction): """Generates cutout mask and the mean pixel value of the bbox. First a location is randomly chosen within the image as the center where the cutout mask will be applied. Note this can be towards the boundaries of the image, so the full cutout mask may not be applied. Args: image: 3D uint8 Tensor. bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. pad_fraction: Float that specifies how large the cutout mask should be in reference to the size of the original bbox. If pad_fraction is 0.25, then the cutout mask will be of shape (0.25 * bbox height, 0.25 * bbox width). Returns: A tuple. First element is a tensor of the same shape as image where each element is either a 1 or 0 that is used to determine where the image will have cutout applied. The second element is the mean of the pixels in the image where the bbox is located. mask value: [0,1] """ image_height, image_width = image.shape[0], image.shape[1] # Transform from shape [1, 4] to [4]. bbox = np.squeeze(bbox) min_y = int(float(image_height) * bbox[0]) min_x = int(float(image_width) * bbox[1]) max_y = int(float(image_height) * bbox[2]) max_x = int(float(image_width) * bbox[3]) # Calculate the mean pixel values in the bounding box, which will be used # to fill the cutout region. mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1)) # Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the # region lies entirely within the bbox. box_height = max_y - min_y + 1 box_width = max_x - min_x + 1 pad_size_height = int(pad_fraction * (box_height / 2)) pad_size_width = int(pad_fraction * (box_width / 2)) # Sample the center location in the image where the zero mask will be applied. cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32) cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32) lower_pad = np.maximum(0, cutout_center_height - pad_size_height) upper_pad = np.maximum( 0, image_height - cutout_center_height - pad_size_height) left_pad = np.maximum(0, cutout_center_width - pad_size_width) right_pad = np.maximum(0, image_width - cutout_center_width - pad_size_width) cutout_shape = [ image_height - (lower_pad + upper_pad), image_width - (left_pad + right_pad) ] padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] mask = np.pad(np.zeros( cutout_shape, dtype=image.dtype), padding_dims, 'constant', constant_values=1) mask = np.expand_dims(mask, 2) mask = np.tile(mask, [1, 1, 3]) return mask, mean def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean): """Applies cutout to the image according to bbox information. This is a cutout variant that uses bbox information to make more informed decisions on where to place the cutout mask. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. pad_fraction: Float that specifies how large the cutout mask should be in reference to the size of the original bbox.
If pad_fraction is 0.25, then the cutout mask will be of shape (0.25 * bbox height, 0.25 * bbox width). replace_with_mean: Boolean that specified what value should be filled in where the cutout mask is applied. Since the incoming image will be of uint8 and will not have had any mean normalization applied, by default we set the value to be 128. If replace_with_mean is True then we find the mean pixel values across the channel dimension and use those to fill in where the cutout mask is applied. Returns: A tuple. First element is a tensor of the same shape as image that has cutout applied to it. Second element is the bboxes that were passed in that will be unchanged. """ def apply_bbox_cutout(image, bboxes, pad_fraction): """Applies cutout to a single bounding box within image.""" # Choose a single bounding box to apply cutout to. random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32) # Select the corresponding bbox and apply cutout. chosen_bbox = np.take(bboxes, random_index, axis=0) mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction) # When applying cutout we either set the pixel value to 128 or to the mean # value inside the bbox. replace = mean if replace_with_mean else [128] * 3 # Apply the cutout mask to the image. Where the mask is 0 we fill it with # `replace`. image = np.where( np.equal(mask, 0), np.ones_like( image, dtype=image.dtype) * replace, image).astype(image.dtype) return image # Check to see if there are boxes, if so then apply boxcutout. if len(bboxes) != 0: image = apply_bbox_cutout(image, bboxes, pad_fraction) return image, bboxes NAME_TO_FUNC = { 'AutoContrast': autocontrast, 'Equalize': equalize, 'Posterize': posterize, 'Solarize': solarize, 'SolarizeAdd': solarize_add, 'Color': color, 'Contrast': contrast, 'Brightness': brightness, 'Sharpness': sharpness, 'Cutout': cutout, 'BBox_Cutout': bbox_cutout, 'Rotate_BBox': rotate_with_bboxes, # pylint:disable=g-long-lambda 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( image, bboxes, pixels, replace, shift_horizontal=True), 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( image, bboxes, pixels, replace, shift_horizontal=False), 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( image, bboxes, level, replace, shear_horizontal=True), 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( image, bboxes, level, replace, shear_horizontal=False), # pylint:enable=g-long-lambda 'Rotate_Only_BBoxes': rotate_only_bboxes, 'ShearX_Only_BBoxes': shear_x_only_bboxes, 'ShearY_Only_BBoxes': shear_y_only_bboxes, 'TranslateX_Only_BBoxes': translate_x_only_bboxes, 'TranslateY_Only_BBoxes': translate_y_only_bboxes, 'Flip_Only_BBoxes': flip_only_bboxes, 'Solarize_Only_BBoxes': solarize_only_bboxes, 'Equalize_Only_BBoxes': equalize_only_bboxes, 'Cutout_Only_BBoxes': cutout_only_bboxes, } def _randomly_negate_tensor(tensor): """With 50% prob turn the tensor negative.""" should_flip = np.floor(np.random.rand() + 0.5) >= 1 final_tensor = tensor if should_flip else -tensor return final_tensor def _rotate_level_to_arg(level): level = (level / _MAX_LEVEL) * 30. level = _randomly_negate_tensor(level) return (level, ) def _shrink_level_to_arg(level): """Converts level to ratio by which we shrink the image content.""" if level == 0: return (1.0, ) # if level is zero, do not shrink the image # Maximum shrinking ratio is 2.9. level = 2. 
/ (_MAX_LEVEL / level) + 0.9 return (level, ) def _enhance_level_to_arg(level): return ((level / _MAX_LEVEL) * 1.8 + 0.1, ) def _shear_level_to_arg(level): level = (level / _MAX_LEVEL) * 0.3 # Flip level to negative with 50% chance. level = _randomly_negate_tensor(level) return (level, ) def _translate_level_to_arg(level, translate_const): level = (level / _MAX_LEVEL) * float(translate_const) # Flip level to negative with 50% chance. level = _randomly_negate_tensor(level) return (level, ) def _bbox_cutout_level_to_arg(level, hparams): cutout_pad_fraction = (level / _MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean def level_to_arg(hparams): return { 'AutoContrast': lambda level: (), 'Equalize': lambda level: (), 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ), 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ), 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ), 'Color': _enhance_level_to_arg, 'Contrast': _enhance_level_to_arg, 'Brightness': _enhance_level_to_arg, 'Sharpness': _enhance_level_to_arg, 'Cutout': lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100 # pylint:disable=g-long-lambda 'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams), 'TranslateX_BBox': lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250 'TranslateY_BBox': lambda level: _translate_level_to_arg(level, 250), # hparams.translate_cons # pylint:enable=g-long-lambda 'ShearX_BBox': _shear_level_to_arg, 'ShearY_BBox': _shear_level_to_arg, 'Rotate_BBox': _rotate_level_to_arg, 'Rotate_Only_BBoxes': _rotate_level_to_arg, 'ShearX_Only_BBoxes': _shear_level_to_arg, 'ShearY_Only_BBoxes': _shear_level_to_arg, # pylint:disable=g-long-lambda 'TranslateX_Only_BBoxes': lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const 'TranslateY_Only_BBoxes': lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const # pylint:enable=g-long-lambda 'Flip_Only_BBoxes': lambda level: (), 'Solarize_Only_BBoxes': lambda level: (int((level / _MAX_LEVEL) * 256), ), 'Equalize_Only_BBoxes': lambda level: (), # pylint:disable=g-long-lambda 'Cutout_Only_BBoxes': lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const # pylint:enable=g-long-lambda } def bbox_wrapper(func): """Adds a bboxes function argument to func and returns unchanged bboxes.""" def wrapper(images, bboxes, *args, **kwargs): return (func(images, *args, **kwargs), bboxes) return wrapper def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams): """Return the function that corresponds to `name` and update `level` param.""" func = NAME_TO_FUNC[name] args = level_to_arg(augmentation_hparams)[name](level) # Check to see if prob is passed into function. This is used for operations # where we alter bboxes independently. # pytype:disable=wrong-arg-types if 'prob' in inspect.getfullargspec(func)[0]: args = tuple([prob] + list(args)) # pytype:enable=wrong-arg-types # Add in replace arg if it is required for the function that is being called. if 'replace' in inspect.getfullargspec(func)[0]: # Make sure replace is the final argument assert 'replace' == inspect.getfullargspec(func)[0][-1] args = tuple(list(args) + [replace_value]) # Add bboxes as the second positional argument for the function if it does # not already exist. 
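# NOTE (illustrative aside, not part of the original source): a policy entry
# is a ('Name', prob, level) triple, and _parse_policy_info() resolves it to
# (callable, prob, args). For example, assuming the usual _MAX_LEVEL = 10,
# ('TranslateX_BBox', 0.6, 4) yields a pixel shift of (4 / 10) * 250 = 100
# (randomly negated) plus the grey fill value (helper name is ours):
def _sketch_parse_policy_entry():
    func, prob, args = _parse_policy_info(
        'TranslateX_BBox', 0.6, 4, [128, 128, 128], {})
    # func is the TranslateX_BBox lambda, prob == 0.6, and
    # args == (+/-100.0, [128, 128, 128]).
    return func, prob, args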
if 'bboxes' not in inspect.getfullargspec(func)[0]: func = bbox_wrapper(func) return (func, prob, args) def _apply_func_with_prob(func, image, args, prob, bboxes): """Apply `func` to image w/ `args` as input with probability `prob`.""" assert isinstance(args, tuple) assert 'bboxes' == inspect.getfullargspec(func)[0][1] # If prob is a function argument, then this randomness is being handled # inside the function, so make sure it is always called. if 'prob' in inspect.getfullargspec(func)[0]: prob = 1.0 # Apply the function with probability `prob`. Note that `prob` is used here; # the earlier port hard-coded 0.5, which ignored the configured probability. should_apply_op = np.floor(np.random.rand() + prob) >= 1 if should_apply_op: augmented_image, augmented_bboxes = func(image, bboxes, *args) else: augmented_image, augmented_bboxes = (image, bboxes) return augmented_image, augmented_bboxes def select_and_apply_random_policy(policies, image, bboxes): """Select a random policy from `policies` and apply it to `image`.""" policy_to_select = np.random.randint(0, len(policies), dtype=np.int32) # policy_to_select = 6 # for test for (i, policy) in enumerate(policies): if i == policy_to_select: image, bboxes = policy(image, bboxes) return (image, bboxes) def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams): """Build a policy from the given policies passed in and apply to image. Args: policies: list of lists of tuples in the form `(func, prob, level)`, `func` is a string name of the augmentation function, `prob` is the probability of applying the `func` operation, `level` is the input argument for `func`. image: numpy array that the resulting policy will be applied to. bboxes: 2D numpy array of normalized bounding boxes for the image. augmentation_hparams: Hparams associated with the NAS learned policy. Returns: A version of image that now has data augmentation applied to it based on the `policies` passed into the function. Additionally, returns bboxes if a value for them is passed in that is not None. """ replace_value = [128, 128, 128] # func is the string name of the augmentation function, prob is the # probability of applying the operation and level is the parameter # associated with the function. # tf_policies are functions that take in an image and return an augmented # image. tf_policies = [] for policy in policies: tf_policy = [] # Link string name to the correct python function and make sure the correct # argument is passed into that function. for policy_info in policy: policy_info = list( policy_info) + [replace_value, augmentation_hparams] tf_policy.append(_parse_policy_info(*policy_info)) # Now build the tf policy that will apply the augmentation procedure # on image. def make_final_policy(tf_policy_): def final_policy(image_, bboxes_): for func, prob, args in tf_policy_: image_, bboxes_ = _apply_func_with_prob(func, image_, args, prob, bboxes_) return image_, bboxes_ return final_policy tf_policies.append(make_final_policy(tf_policy)) augmented_images, augmented_bboxes = select_and_apply_random_policy( tf_policies, image, bboxes) # If no bounding boxes were specified, then just return the images. return (augmented_images, augmented_bboxes) # TODO(barretzoph): Add in ArXiv link once paper is out. def distort_image_with_autoaugment(image, bboxes, augmentation_name): """Applies the AutoAugment policy to `image` and `bboxes`. Args: image: `Tensor` of shape [height, width, 3] representing an image. bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are normalized between [0, 1]. augmentation_name: The name of the AutoAugment policy to use. The available options are `v0`, `v1`, `v2`, `v3` and `test`.
`v0` is the policy used for all of the results in the paper and was found to achieve the best results on the COCO dataset. `v1`, `v2` and `v3` are additional good policies found on the COCO dataset that have slight variation in what operations were used during the search procedure along with how many operations are applied in parallel to a single image (2 vs 3). Returns: A tuple containing the augmented versions of `image` and `bboxes`. """ available_policies = { 'v0': policy_v0, 'v1': policy_v1, 'v2': policy_v2, 'v3': policy_v3, 'test': policy_vtest } if augmentation_name not in available_policies: raise ValueError('Invalid augmentation_name: {}'.format( augmentation_name)) policy = available_policies[augmentation_name]() augmentation_hparams = {} return build_and_apply_nas_policy(policy, image, bboxes, augmentation_hparams) ================================================ FILE: ppdet/data/transform/batch_operators.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import typing try: from collections.abc import Sequence except Exception: from collections import Sequence import cv2 import copy import math import numpy as np from .operators import register_op, BaseOperator, Resize from .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian from .atss_assigner import ATSSAssigner from scipy import ndimage from ppdet.modeling import bbox_utils from ppdet.utils.logger import setup_logger from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform logger = setup_logger(__name__) __all__ = [ 'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget', 'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseTarget', 'PadMaskBatch', 'Gt2GFLTarget', 'Gt2CenterNetTarget', 'Gt2CenterTrackTarget', 'PadGT', 'PadRGT', 'BatchRandomResizeForSSOD' ] @register_op class PadBatch(BaseOperator): """ Pad a batch of samples so they can be divisible by a stride. The layout of each image should be 'CHW'. Args: pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure height and width is divisible by `pad_to_stride`. """ def __init__(self, pad_to_stride=0): super(PadBatch, self).__init__() self.pad_to_stride = pad_to_stride def __call__(self, samples, context=None): """ Args: samples (list): a batch of sample, each is dict. 
""" coarsest_stride = self.pad_to_stride # multi scale input is nested list if isinstance(samples, typing.Sequence) and len(samples) > 0 and isinstance( samples[0], typing.Sequence): inner_samples = samples[0] else: inner_samples = samples max_shape = np.array( [data['image'].shape for data in inner_samples]).max(axis=0) if coarsest_stride > 0: max_shape[1] = int( np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) max_shape[2] = int( np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) for data in inner_samples: im = data['image'] im_c, im_h, im_w = im.shape[:] padding_im = np.zeros( (im_c, max_shape[1], max_shape[2]), dtype=np.float32) padding_im[:, :im_h, :im_w] = im data['image'] = padding_im if 'semantic' in data and data['semantic'] is not None: semantic = data['semantic'] padding_sem = np.zeros( (1, max_shape[1], max_shape[2]), dtype=np.float32) padding_sem[:, :im_h, :im_w] = semantic data['semantic'] = padding_sem if 'gt_segm' in data and data['gt_segm'] is not None: gt_segm = data['gt_segm'] padding_segm = np.zeros( (gt_segm.shape[0], max_shape[1], max_shape[2]), dtype=np.uint8) padding_segm[:, :im_h, :im_w] = gt_segm data['gt_segm'] = padding_segm return samples @register_op class BatchRandomResize(BaseOperator): """ Resize image to target size randomly. random target_size and interpolation method Args: target_size (int, list, tuple): image target size, if random size is True, must be list or tuple keep_ratio (bool): whether keep_raio or not, default true interp (int): the interpolation method random_size (bool): whether random select target size of image random_interp (bool): whether random select interpolation method """ def __init__(self, target_size, keep_ratio, interp=cv2.INTER_NEAREST, random_size=True, random_interp=False): super(BatchRandomResize, self).__init__() self.keep_ratio = keep_ratio self.interps = [ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4, ] self.interp = interp assert isinstance(target_size, ( int, Sequence)), "target_size must be int, list or tuple" if random_size and not isinstance(target_size, list): raise TypeError( "Type of target_size is invalid when random_size is True. Must be List, now is {}". format(type(target_size))) self.target_size = target_size self.random_size = random_size self.random_interp = random_interp def __call__(self, samples, context=None): if self.random_size: index = np.random.choice(len(self.target_size)) target_size = self.target_size[index] else: target_size = self.target_size if self.random_interp: interp = np.random.choice(self.interps) else: interp = self.interp resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp) return resizer(samples, context=context) @register_op class Gt2YoloTarget(BaseOperator): __shared__ = ['num_classes'] """ Generate YOLOv3 targets by groud truth data, this operator is only used in fine grained YOLOv3 loss mode """ def __init__(self, anchors, anchor_masks, downsample_ratios, num_classes=80, iou_thresh=1.): super(Gt2YoloTarget, self).__init__() self.anchors = anchors self.anchor_masks = anchor_masks self.downsample_ratios = downsample_ratios self.num_classes = num_classes self.iou_thresh = iou_thresh def __call__(self, samples, context=None): assert len(self.anchor_masks) == len(self.downsample_ratios), \ "anchor_masks', and 'downsample_ratios' should have same length." 
h, w = samples[0]['image'].shape[1:3] an_hw = np.array(self.anchors) / np.array([[w, h]]) for sample in samples: gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] if 'gt_score' not in sample: sample['gt_score'] = np.ones( (gt_bbox.shape[0], 1), dtype=np.float32) gt_score = sample['gt_score'] for i, ( mask, downsample_ratio ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): grid_h = int(h / downsample_ratio) grid_w = int(w / downsample_ratio) target = np.zeros( (len(mask), 6 + self.num_classes, grid_h, grid_w), dtype=np.float32) for b in range(gt_bbox.shape[0]): gx, gy, gw, gh = gt_bbox[b, :] cls = gt_class[b] score = gt_score[b] if gw <= 0. or gh <= 0. or score <= 0.: continue # find best match anchor index best_iou = 0. best_idx = -1 for an_idx in range(an_hw.shape[0]): iou = jaccard_overlap( [0., 0., gw, gh], [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) if iou > best_iou: best_iou = iou best_idx = an_idx gi = int(gx * grid_w) gj = int(gy * grid_h) # the gt box should be regressed in this layer if the best # matching anchor index is in the anchor mask of this layer if best_idx in mask: best_n = mask.index(best_idx) # x, y, w, h, scale target[best_n, 0, gj, gi] = gx * grid_w - gi target[best_n, 1, gj, gi] = gy * grid_h - gj target[best_n, 2, gj, gi] = np.log( gw * w / self.anchors[best_idx][0]) target[best_n, 3, gj, gi] = np.log( gh * h / self.anchors[best_idx][1]) target[best_n, 4, gj, gi] = 2.0 - gw * gh # objectness record gt_score target[best_n, 5, gj, gi] = score # classification target[best_n, 6 + cls, gj, gi] = 1. # For non-matched anchors, calculate the target if the iou # between anchor and gt is larger than iou_thresh if self.iou_thresh < 1: for idx, mask_i in enumerate(mask): if mask_i == best_idx: continue iou = jaccard_overlap( [0., 0., gw, gh], [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) if iou > self.iou_thresh and target[idx, 5, gj, gi] == 0.: # x, y, w, h, scale target[idx, 0, gj, gi] = gx * grid_w - gi target[idx, 1, gj, gi] = gy * grid_h - gj target[idx, 2, gj, gi] = np.log( gw * w / self.anchors[mask_i][0]) target[idx, 3, gj, gi] = np.log( gh * h / self.anchors[mask_i][1]) target[idx, 4, gj, gi] = 2.0 - gw * gh # objectness record gt_score target[idx, 5, gj, gi] = score # classification target[idx, 6 + cls, gj, gi] = 1.
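# NOTE (illustrative aside, not part of the original source): the target
# written above follows the standard YOLOv3 box parameterisation. For a
# normalised gt box (gx, gy, gw, gh) assigned to grid cell (gi, gj):
#   tx = gx * grid_w - gi         offset of the centre within its cell, [0, 1)
#   ty = gy * grid_h - gj
#   tw = log(gw * w / anchor_w)   log-ratio of gt size to anchor size
#   th = log(gh * h / anchor_h)
#   scale = 2 - gw * gh           loss weight that favours small boxes
# A compact restatement (helper name is ours):
def _sketch_yolo_encode(gx, gy, gw, gh, gi, gj, grid_w, grid_h, w, h, anchor):
    return (gx * grid_w - gi, gy * grid_h - gj,
            math.log(gw * w / anchor[0]), math.log(gh * h / anchor[1]),
            2.0 - gw * gh)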
sample['target{}'.format(i)] = target # remove useless gt_class and gt_score after target calculated sample.pop('gt_class') sample.pop('gt_score') return samples @register_op class Gt2FCOSTarget(BaseOperator): """ Generate FCOS targets by groud truth data """ def __init__(self, object_sizes_boundary, center_sampling_radius, downsample_ratios, num_shift=0.5, multiply_strides_reg_targets=False, norm_reg_targets=True): super(Gt2FCOSTarget, self).__init__() self.center_sampling_radius = center_sampling_radius self.downsample_ratios = downsample_ratios self.INF = np.inf self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF] object_sizes_of_interest = [] for i in range(len(self.object_sizes_boundary) - 1): object_sizes_of_interest.append([ self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1] ]) self.object_sizes_of_interest = object_sizes_of_interest self.num_shift = num_shift self.multiply_strides_reg_targets = multiply_strides_reg_targets self.norm_reg_targets = norm_reg_targets def _compute_points(self, w, h): """ compute the corresponding points in each feature map :param h: image height :param w: image width :return: points from all feature map """ locations = [] for stride in self.downsample_ratios: shift_x = np.arange(0, w, stride).astype(np.float32) shift_y = np.arange(0, h, stride).astype(np.float32) shift_x, shift_y = np.meshgrid(shift_x, shift_y) shift_x = shift_x.flatten() shift_y = shift_y.flatten() location = np.stack( [shift_x, shift_y], axis=1) + stride * self.num_shift locations.append(location) num_points_each_level = [len(location) for location in locations] locations = np.concatenate(locations, axis=0) return locations, num_points_each_level def _convert_xywh2xyxy(self, gt_bbox, w, h): """ convert the bounding box from style xywh to xyxy :param gt_bbox: bounding boxes normalized into [0, 1] :param w: image width :param h: image height :return: bounding boxes in xyxy style """ bboxes = gt_bbox.copy() bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] return bboxes def _check_inside_boxes_limited(self, gt_bbox, xs, ys, num_points_each_level): """ check if points is within the clipped boxes :param gt_bbox: bounding boxes :param xs: horizontal coordinate of points :param ys: vertical coordinate of points :return: the mask of points is within gt_box or not """ bboxes = np.reshape( gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]]) bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1]) ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2 ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2 beg = 0 clipped_box = bboxes.copy() for lvl, stride in enumerate(self.downsample_ratios): end = beg + num_points_each_level[lvl] stride_exp = self.center_sampling_radius * stride clipped_box[beg:end, :, 0] = np.maximum( bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp) clipped_box[beg:end, :, 1] = np.maximum( bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp) clipped_box[beg:end, :, 2] = np.minimum( bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp) clipped_box[beg:end, :, 3] = np.minimum( bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp) beg = end l_res = xs - clipped_box[:, :, 0] r_res = clipped_box[:, :, 2] - xs t_res = ys - clipped_box[:, :, 1] b_res = clipped_box[:, :, 3] - ys clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0 return inside_gt_box def 
__call__(self, samples, context=None): assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \ "object_sizes_of_interest', and 'downsample_ratios' should have same length." for sample in samples: im = sample['image'] bboxes = sample['gt_bbox'] gt_class = sample['gt_class'] # calculate the locations h, w = im.shape[1:3] points, num_points_each_level = self._compute_points(w, h) object_scale_exp = [] for i, num_pts in enumerate(num_points_each_level): object_scale_exp.append( np.tile( np.array([self.object_sizes_of_interest[i]]), reps=[num_pts, 1])) object_scale_exp = np.concatenate(object_scale_exp, axis=0) gt_area = (bboxes[:, 2] - bboxes[:, 0]) * ( bboxes[:, 3] - bboxes[:, 1]) xs, ys = points[:, 0], points[:, 1] xs = np.reshape(xs, newshape=[xs.shape[0], 1]) xs = np.tile(xs, reps=[1, bboxes.shape[0]]) ys = np.reshape(ys, newshape=[ys.shape[0], 1]) ys = np.tile(ys, reps=[1, bboxes.shape[0]]) l_res = xs - bboxes[:, 0] r_res = bboxes[:, 2] - xs t_res = ys - bboxes[:, 1] b_res = bboxes[:, 3] - ys reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) if self.center_sampling_radius > 0: is_inside_box = self._check_inside_boxes_limited( bboxes, xs, ys, num_points_each_level) else: is_inside_box = np.min(reg_targets, axis=2) > 0 # check if the targets is inside the corresponding level max_reg_targets = np.max(reg_targets, axis=2) lower_bound = np.tile( np.expand_dims( object_scale_exp[:, 0], axis=1), reps=[1, max_reg_targets.shape[1]]) high_bound = np.tile( np.expand_dims( object_scale_exp[:, 1], axis=1), reps=[1, max_reg_targets.shape[1]]) is_match_current_level = \ (max_reg_targets > lower_bound) & \ (max_reg_targets < high_bound) points2gtarea = np.tile( np.expand_dims( gt_area, axis=0), reps=[xs.shape[0], 1]) points2gtarea[is_inside_box == 0] = self.INF points2gtarea[is_match_current_level == 0] = self.INF points2min_area = points2gtarea.min(axis=1) points2min_area_ind = points2gtarea.argmin(axis=1) labels = gt_class[points2min_area_ind] + 1 labels[points2min_area == self.INF] = 0 reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind] ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \ reg_targets[:, [0, 2]].max(axis=1)) * \ (reg_targets[:, [1, 3]].min(axis=1) / \ reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32) ctn_targets = np.reshape( ctn_targets, newshape=[ctn_targets.shape[0], 1]) ctn_targets[labels <= 0] = 0 pos_ind = np.nonzero(labels != 0) reg_targets_pos = reg_targets[pos_ind[0], :] split_sections = [] beg = 0 for lvl in range(len(num_points_each_level)): end = beg + num_points_each_level[lvl] split_sections.append(end) beg = end labels_by_level = np.split(labels, split_sections, axis=0) reg_targets_by_level = np.split(reg_targets, split_sections, axis=0) ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0) for lvl in range(len(self.downsample_ratios)): grid_w = int(np.ceil(w / self.downsample_ratios[lvl])) grid_h = int(np.ceil(h / self.downsample_ratios[lvl])) if self.norm_reg_targets: if self.multiply_strides_reg_targets: sample['reg_target{}'.format(lvl)] = np.reshape( reg_targets_by_level[lvl], newshape=[grid_h, grid_w, 4]) else: sample['reg_target{}'.format(lvl)] = \ np.reshape( reg_targets_by_level[lvl] / \ self.downsample_ratios[lvl], newshape=[grid_h, grid_w, 4]) else: sample['reg_target{}'.format(lvl)] = np.reshape( reg_targets_by_level[lvl], newshape=[grid_h, grid_w, 4]) sample['labels{}'.format(lvl)] = np.reshape( labels_by_level[lvl], newshape=[grid_h, grid_w, 1]) sample['centerness{}'.format(lvl)] = 
np.reshape( ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1]) sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) return samples @register_op class Gt2GFLTarget(BaseOperator): __shared__ = ['num_classes'] """ Generate GFocal loss targets by ground truth data """ def __init__(self, num_classes=80, downsample_ratios=[8, 16, 32, 64, 128], grid_cell_scale=4, cell_offset=0, compute_vlr_region=False): super(Gt2GFLTarget, self).__init__() self.num_classes = num_classes self.downsample_ratios = downsample_ratios self.grid_cell_scale = grid_cell_scale self.cell_offset = cell_offset self.compute_vlr_region = compute_vlr_region self.assigner = ATSSAssigner() def get_grid_cells(self, featmap_size, scale, stride, offset=0): """ Generate grid cells of a feature map for target assignment. Args: featmap_size: Size of a single level feature map. scale: Grid cell scale. stride: Down sample stride of the feature map. offset: Offset of grid cells. return: Grid_cells xyxy position. Size should be [feat_w * feat_h, 4] """ cell_size = stride * scale h, w = featmap_size x_range = (np.arange(w, dtype=np.float32) + offset) * stride y_range = (np.arange(h, dtype=np.float32) + offset) * stride x, y = np.meshgrid(x_range, y_range) y = y.flatten() x = x.flatten() grid_cells = np.stack( [ x - 0.5 * cell_size, y - 0.5 * cell_size, x + 0.5 * cell_size, y + 0.5 * cell_size ], axis=-1) return grid_cells def get_sample(self, assign_gt_inds, gt_bboxes): pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 if gt_bboxes.size == 0: # hack for index error case assert pos_assigned_gt_inds.size == 0 pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) else: if len(gt_bboxes.shape) < 2: # use reshape, not ndarray.resize: resize mutates in place and # returns None, so assigning its result would lose the array gt_bboxes = gt_bboxes.reshape(-1, 4) pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds def __call__(self, samples, context=None): assert len(samples) > 0 batch_size = len(samples) # get grid cells of image h, w = samples[0]['image'].shape[1:3] multi_level_grid_cells = [] for stride in self.downsample_ratios: featmap_size = (int(math.ceil(h / stride)), int(math.ceil(w / stride))) multi_level_grid_cells.append( self.get_grid_cells(featmap_size, self.grid_cell_scale, stride, self.cell_offset)) mlvl_grid_cells_list = [ multi_level_grid_cells for i in range(batch_size) ] # pixel cell number of multi-level feature maps num_level_cells = [ grid_cells.shape[0] for grid_cells in mlvl_grid_cells_list[0] ] num_level_cells_list = [num_level_cells] * batch_size # concat all level cells into a single array for i in range(batch_size): mlvl_grid_cells_list[i] = np.concatenate(mlvl_grid_cells_list[i]) # target assign on all images for sample, grid_cells, num_level_cells in zip( samples, mlvl_grid_cells_list, num_level_cells_list): gt_bboxes = sample['gt_bbox'] gt_labels = sample['gt_class'].squeeze() if gt_labels.size == 1: gt_labels = np.array([gt_labels]).astype(np.int32) gt_bboxes_ignore = None assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore, gt_labels) if self.compute_vlr_region: vlr_region = self.assigner.get_vlr_region( grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore, gt_labels) sample['vlr_regions'] = vlr_region pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample( assign_gt_inds, gt_bboxes) num_cells = grid_cells.shape[0] bbox_targets =
np.zeros_like(grid_cells) bbox_weights = np.zeros_like(grid_cells) labels = np.ones([num_cells], dtype=np.int64) * self.num_classes label_weights = np.zeros([num_cells], dtype=np.float32) if len(pos_inds) > 0: pos_bbox_targets = pos_gt_bboxes bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if not np.any(gt_labels): labels[pos_inds] = 0 else: labels[pos_inds] = gt_labels[pos_assigned_gt_inds] label_weights[pos_inds] = 1.0 if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 sample['grid_cells'] = grid_cells sample['labels'] = labels sample['label_weights'] = label_weights sample['bbox_targets'] = bbox_targets sample['pos_num'] = max(pos_inds.size, 1) sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) sample.pop('gt_score', None) return samples @register_op class Gt2TTFTarget(BaseOperator): __shared__ = ['num_classes'] """ Gt2TTFTarget Generate TTFNet targets by ground truth data Args: num_classes(int): the number of classes. down_ratio(int): the down ratio from images to heatmap, 4 by default. alpha(float): the alpha parameter to generate gaussian target. 0.54 by default. """ def __init__(self, num_classes=80, down_ratio=4, alpha=0.54): super(Gt2TTFTarget, self).__init__() self.down_ratio = down_ratio self.num_classes = num_classes self.alpha = alpha def __call__(self, samples, context=None): output_size = samples[0]['image'].shape[1] feat_size = output_size // self.down_ratio for sample in samples: heatmap = np.zeros( (self.num_classes, feat_size, feat_size), dtype='float32') box_target = np.ones( (4, feat_size, feat_size), dtype='float32') * -1 reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32') gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1 bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1 area = bbox_w * bbox_h boxes_areas_log = np.log(area) boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1] boxes_area_topk_log = boxes_areas_log[boxes_ind] gt_bbox = gt_bbox[boxes_ind] gt_class = gt_class[boxes_ind] feat_gt_bbox = gt_bbox / self.down_ratio feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1) feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1], feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0]) ct_inds = np.stack( [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2, (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2], axis=1) / self.down_ratio h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32') w_radiuses_alpha = (feat_ws / 2. 
* self.alpha).astype('int32') for k in range(len(gt_bbox)): cls_id = gt_class[k] fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32') self.draw_truncate_gaussian(fake_heatmap, ct_inds[k], h_radiuses_alpha[k], w_radiuses_alpha[k]) heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap) box_target_inds = fake_heatmap > 0 box_target[:, box_target_inds] = gt_bbox[k][:, None] local_heatmap = fake_heatmap[box_target_inds] ct_div = np.sum(local_heatmap) local_heatmap *= boxes_area_topk_log[k] reg_weight[0, box_target_inds] = local_heatmap / ct_div sample['ttf_heatmap'] = heatmap sample['ttf_box_target'] = box_target sample['ttf_reg_weight'] = reg_weight sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) sample.pop('gt_score', None) return samples def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius): h, w = 2 * h_radius + 1, 2 * w_radius + 1 sigma_x = w / 6 sigma_y = h / 6 gaussian = gaussian2D((h, w), sigma_x, sigma_y) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, w_radius), min(width - x, w_radius + 1) top, bottom = min(y, h_radius), min(height - y, h_radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius - left:w_radius + right] if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: heatmap[y - top:y + bottom, x - left:x + right] = np.maximum( masked_heatmap, masked_gaussian) return heatmap @register_op class Gt2Solov2Target(BaseOperator): """Assign mask target and labels in SOLOv2 network. The code of this function is based on: https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271 Args: num_grids (list): The list of feature map grids size. scale_ranges (list): The list of mask boundary range. coord_sigma (float): The coefficient of coordinate area length. sampling_ratio (float): The ratio of down sampling. 
""" def __init__(self, num_grids=[40, 36, 24, 16, 12], scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768], [384, 2048]], coord_sigma=0.2, sampling_ratio=4.0): super(Gt2Solov2Target, self).__init__() self.num_grids = num_grids self.scale_ranges = scale_ranges self.coord_sigma = coord_sigma self.sampling_ratio = sampling_ratio def _scale_size(self, im, scale): h, w = im.shape[:2] new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)) resized_img = cv2.resize( im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) return resized_img def __call__(self, samples, context=None): sample_id = 0 max_ins_num = [0] * len(self.num_grids) for sample in samples: gt_bboxes_raw = sample['gt_bbox'] gt_labels_raw = sample['gt_class'] + 1 im_c, im_h, im_w = sample['image'].shape[:] gt_masks_raw = sample['gt_segm'].astype(np.uint8) mask_feat_size = [ int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio) ] gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) ins_ind_label_list = [] idx = 0 for (lower_bound, upper_bound), num_grid \ in zip(self.scale_ranges, self.num_grids): hit_indices = ((gt_areas >= lower_bound) & (gt_areas <= upper_bound)).nonzero()[0] num_ins = len(hit_indices) ins_label = [] grid_order = [] cate_label = np.zeros([num_grid, num_grid], dtype=np.int64) ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_) if num_ins == 0: ins_label = np.zeros( [1, mask_feat_size[0], mask_feat_size[1]], dtype=np.uint8) ins_ind_label_list.append(ins_ind_label) sample['cate_label{}'.format(idx)] = cate_label.flatten() sample['ins_label{}'.format(idx)] = ins_label sample['grid_order{}'.format(idx)] = np.asarray( [sample_id * num_grid * num_grid + 0], dtype=np.int32) idx += 1 continue gt_bboxes = gt_bboxes_raw[hit_indices] gt_labels = gt_labels_raw[hit_indices] gt_masks = gt_masks_raw[hit_indices, ...] half_ws = 0.5 * ( gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma half_hs = 0.5 * ( gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma for seg_mask, gt_label, half_h, half_w in zip( gt_masks, gt_labels, half_hs, half_ws): if seg_mask.sum() == 0: continue # mass center upsampled_size = (mask_feat_size[0] * 4, mask_feat_size[1] * 4) center_h, center_w = ndimage.measurements.center_of_mass( seg_mask) coord_w = int( (center_w / upsampled_size[1]) // (1. / num_grid)) coord_h = int( (center_h / upsampled_size[0]) // (1. / num_grid)) # left, top, right, down top_box = max(0, int(((center_h - half_h) / upsampled_size[0]) // (1. / num_grid))) down_box = min(num_grid - 1, int(((center_h + half_h) / upsampled_size[0]) // (1. / num_grid))) left_box = max(0, int(((center_w - half_w) / upsampled_size[1]) // (1. / num_grid))) right_box = min(num_grid - 1, int(((center_w + half_w) / upsampled_size[1]) // (1. / num_grid))) top = max(top_box, coord_h - 1) down = min(down_box, coord_h + 1) left = max(coord_w - 1, left_box) right = min(right_box, coord_w + 1) cate_label[top:(down + 1), left:(right + 1)] = gt_label seg_mask = self._scale_size( seg_mask, scale=1. 
/ self.sampling_ratio) for i in range(top, down + 1): for j in range(left, right + 1): label = int(i * num_grid + j) cur_ins_label = np.zeros( [mask_feat_size[0], mask_feat_size[1]], dtype=np.uint8) cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[ 1]] = seg_mask ins_label.append(cur_ins_label) ins_ind_label[label] = True grid_order.append(sample_id * num_grid * num_grid + label) if ins_label == []: ins_label = np.zeros( [1, mask_feat_size[0], mask_feat_size[1]], dtype=np.uint8) ins_ind_label_list.append(ins_ind_label) sample['cate_label{}'.format(idx)] = cate_label.flatten() sample['ins_label{}'.format(idx)] = ins_label sample['grid_order{}'.format(idx)] = np.asarray( [sample_id * num_grid * num_grid + 0], dtype=np.int32) else: ins_label = np.stack(ins_label, axis=0) ins_ind_label_list.append(ins_ind_label) sample['cate_label{}'.format(idx)] = cate_label.flatten() sample['ins_label{}'.format(idx)] = ins_label sample['grid_order{}'.format(idx)] = np.asarray( grid_order, dtype=np.int32) assert len(grid_order) > 0 max_ins_num[idx] = max( max_ins_num[idx], sample['ins_label{}'.format(idx)].shape[0]) idx += 1 ins_ind_labels = np.concatenate([ ins_ind_labels_level_img for ins_ind_labels_level_img in ins_ind_label_list ]) fg_num = np.sum(ins_ind_labels) sample['fg_num'] = fg_num sample_id += 1 sample.pop('is_crowd') sample.pop('gt_class') sample.pop('gt_bbox') sample.pop('gt_poly') sample.pop('gt_segm') # padding batch for data in samples: for idx in range(len(self.num_grids)): gt_ins_data = np.zeros( [ max_ins_num[idx], data['ins_label{}'.format(idx)].shape[1], data['ins_label{}'.format(idx)].shape[2] ], dtype=np.uint8) gt_ins_data[0:data['ins_label{}'.format(idx)].shape[ 0], :, :] = data['ins_label{}'.format(idx)] gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32) gt_grid_order[0:data['grid_order{}'.format(idx)].shape[ 0]] = data['grid_order{}'.format(idx)] data['ins_label{}'.format(idx)] = gt_ins_data data['grid_order{}'.format(idx)] = gt_grid_order return samples @register_op class Gt2SparseTarget(BaseOperator): def __init__(self, use_padding_shape=False): super(Gt2SparseTarget, self).__init__() self.use_padding_shape = use_padding_shape def __call__(self, samples, context=None): for sample in samples: ori_h, ori_w = sample['h'], sample['w'] if self.use_padding_shape: h, w = sample["image"].shape[1:3] if "scale_factor" in sample: sf_w, sf_h = sample["scale_factor"][1], sample[ "scale_factor"][0] sample["scale_factor_whwh"] = np.array( [sf_w, sf_h, sf_w, sf_h], dtype=np.float32) else: sample["scale_factor_whwh"] = np.array( [1.0, 1.0, 1.0, 1.0], dtype=np.float32) else: h, w = round(sample['im_shape'][0]), round(sample['im_shape'][ 1]) sample["scale_factor_whwh"] = np.array( [w / ori_w, h / ori_h, w / ori_w, h / ori_h], dtype=np.float32) sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32) sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32) return samples @register_op class PadMaskBatch(BaseOperator): """ Pad a batch of samples so that they can be divisible by a stride. The layout of each image should be 'CHW'. Args: pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure height and width is divisible by `pad_to_stride`. return_pad_mask (bool): If `return_pad_mask = True`, return `pad_mask` for transformer. 
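        Example (illustrative): with `pad_to_stride=32`, CHW images of shapes
        (3, 500, 600) and (3, 480, 640) in one batch are zero-padded to
        (3, 512, 640), and each sample gains a 'pad_mask' that is 1.0 over
        its valid (unpadded) region.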
""" def __init__(self, pad_to_stride=0, return_pad_mask=True): super(PadMaskBatch, self).__init__() self.pad_to_stride = pad_to_stride self.return_pad_mask = return_pad_mask def __call__(self, samples, context=None): """ Args: samples (list): a batch of sample, each is dict. """ coarsest_stride = self.pad_to_stride max_shape = np.array([data['image'].shape for data in samples]).max( axis=0) if coarsest_stride > 0: max_shape[1] = int( np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) max_shape[2] = int( np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) for data in samples: im = data['image'] im_c, im_h, im_w = im.shape[:] padding_im = np.zeros( (im_c, max_shape[1], max_shape[2]), dtype=np.float32) padding_im[:, :im_h, :im_w] = im.astype(np.float32) data['image'] = padding_im if 'semantic' in data and data['semantic'] is not None: semantic = data['semantic'] padding_sem = np.zeros( (1, max_shape[1], max_shape[2]), dtype=np.float32) padding_sem[:, :im_h, :im_w] = semantic data['semantic'] = padding_sem if 'gt_segm' in data and data['gt_segm'] is not None: gt_segm = data['gt_segm'] padding_segm = np.zeros( (gt_segm.shape[0], max_shape[1], max_shape[2]), dtype=np.uint8) padding_segm[:, :im_h, :im_w] = gt_segm data['gt_segm'] = padding_segm if self.return_pad_mask: padding_mask = np.zeros( (max_shape[1], max_shape[2]), dtype=np.float32) padding_mask[:im_h, :im_w] = 1. data['pad_mask'] = padding_mask return samples @register_op class Gt2CenterNetTarget(BaseOperator): __shared__ = ['num_classes'] """Gt2CenterNetTarget Genterate CenterNet targets by ground-truth Args: down_ratio (int): The down sample ratio between output feature and input image. num_classes (int): The number of classes, 80 by default. max_objs (int): The maximum objects detected, 128 by default. """ def __init__(self, num_classes=80, down_ratio=4, max_objs=128): super(Gt2CenterNetTarget, self).__init__() self.nc = num_classes self.down_ratio = down_ratio self.max_objs = max_objs def __call__(self, sample, context=None): input_h, input_w = sample['image'].shape[1:] output_h = input_h // self.down_ratio output_w = input_w // self.down_ratio gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) wh = np.zeros((self.max_objs, 2), dtype=np.float32) reg = np.zeros((self.max_objs, 2), dtype=np.float32) ind = np.zeros((self.max_objs), dtype=np.int64) reg_mask = np.zeros((self.max_objs), dtype=np.int32) cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32) cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32) trans_output = get_affine_transform( center=sample['center'], input_size=[sample['scale'], sample['scale']], rot=0, output_size=[output_w, output_h]) gt_det = [] for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): cls = int(cls) bbox[:2] = affine_transform(bbox[:2], trans_output) bbox[2:] = affine_transform(bbox[2:], trans_output) bbox_amodal = copy.deepcopy(bbox) bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] if h > 0 and w > 0: radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) radius = max(0, int(radius)) ct = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32) ct_int = ct.astype(np.int32) # get hm,wh,reg,ind,ind_mask draw_umich_gaussian(hm[cls], ct_int, radius) wh[i] = 1. * w, 1. 
* h reg[i] = ct - ct_int ind[i] = ct_int[1] * output_w + ct_int[0] reg_mask[i] = 1 cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i] cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1 gt_det.append([ ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2, 1, cls ]) sample.pop('gt_bbox', None) sample.pop('gt_class', None) sample.pop('center', None) sample.pop('scale', None) sample.pop('is_crowd', None) sample.pop('difficult', None) sample['index'] = ind sample['index_mask'] = reg_mask sample['heatmap'] = hm sample['size'] = wh sample['offset'] = reg return sample @register_op class PadGT(BaseOperator): """ Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... The num_max_boxes is the largest for batch. Args: return_gt_mask (bool): If true, return `pad_gt_mask`, 1 means bbox, 0 means no bbox. """ def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0, only_origin_box=False): super(PadGT, self).__init__() self.return_gt_mask = return_gt_mask self.pad_img = pad_img self.minimum_gtnum = minimum_gtnum self.only_origin_box = only_origin_box def _impad(self, img: np.ndarray, *, shape=None, padding=None, pad_val=0, padding_mode='constant') -> np.ndarray: """Pad the given image to a certain shape or pad on all sides with specified padding mode and padding value. Args: img (ndarray): Image to be padded. shape (tuple[int]): Expected padding shape (h, w). Default: None. padding (int or tuple[int]): Padding on each border. If a single int is provided this is used to pad all borders. If tuple of length 2 is provided this is the padding on left/right and top/bottom respectively. If a tuple of length 4 is provided this is the padding for the left, top, right and bottom borders respectively. Default: None. Note that `shape` and `padding` can not be both set. pad_val (Number | Sequence[Number]): Values to be filled in padding areas when padding_mode is 'constant'. Default: 0. padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default: constant. - constant: pads with a constant value, this value is specified with pad_val. - edge: pads with the last value at the edge of the image. - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2]. - symmetric: pads with reflection of image repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode will result in [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. """ assert (shape is not None) ^ (padding is not None) if shape is not None: width = max(shape[1] - img.shape[1], 0) height = max(shape[0] - img.shape[0], 0) padding = (0, 0, int(width), int(height)) # check pad_val import numbers if isinstance(pad_val, tuple): assert len(pad_val) == img.shape[-1] elif not isinstance(pad_val, numbers.Number): raise TypeError('pad_val must be a int or a tuple. ' f'But received {type(pad_val)}') # check padding if isinstance(padding, tuple) and len(padding) in [2, 4]: if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) elif isinstance(padding, numbers.Number): padding = (padding, padding, padding, padding) else: raise ValueError('Padding must be a int or a 2, or 4 element tuple.' 
f'But received {padding}') # check padding mode assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] border_type = { 'constant': cv2.BORDER_CONSTANT, 'edge': cv2.BORDER_REPLICATE, 'reflect': cv2.BORDER_REFLECT_101, 'symmetric': cv2.BORDER_REFLECT } img = cv2.copyMakeBorder( img, padding[1], padding[3], padding[0], padding[2], border_type[padding_mode], value=pad_val) return img def checkmaxshape(self, samples): maxh, maxw = 0, 0 for sample in samples: h, w = sample['im_shape'] if h > maxh: maxh = h if w > maxw: maxw = w return (maxh, maxw) def __call__(self, samples, context=None): num_max_boxes = max([len(s['gt_bbox']) for s in samples]) num_max_boxes = max(self.minimum_gtnum, num_max_boxes) if self.pad_img: maxshape = self.checkmaxshape(samples) if self.only_origin_box: for sample in samples: if self.pad_img: img = sample['image'] padimg = self._impad(img, shape=maxshape) sample['image'] = padimg if self.return_gt_mask: sample['pad_origin_gt_mask'] = np.zeros( (num_max_boxes, 1), dtype=np.float32) if num_max_boxes == 0: continue num_gt = len(sample['origin_gt_bbox']) pad_origin_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) pad_origin_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) if num_gt > 0: pad_origin_gt_class[:num_gt] = sample['origin_gt_class'] pad_origin_gt_bbox[:num_gt] = sample['origin_gt_bbox'] sample['origin_gt_class'] = pad_origin_gt_class sample['origin_gt_bbox'] = pad_origin_gt_bbox if 'pad_origin_gt_mask' in sample: sample['pad_origin_gt_mask'][:num_gt] = 1 else: for sample in samples: if self.pad_img: img = sample['image'] padimg = self._impad(img, shape=maxshape) sample['image'] = padimg if self.return_gt_mask: sample['pad_gt_mask'] = np.zeros( (num_max_boxes, 1), dtype=np.float32) if num_max_boxes == 0: continue num_gt = len(sample['gt_bbox']) pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) if num_gt > 0: pad_gt_class[:num_gt] = sample['gt_class'] pad_gt_bbox[:num_gt] = sample['gt_bbox'] sample['gt_class'] = pad_gt_class sample['gt_bbox'] = pad_gt_bbox # pad_gt_mask if 'pad_gt_mask' in sample: sample['pad_gt_mask'][:num_gt] = 1 # gt_score if 'gt_score' in sample: pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32) if num_gt > 0: pad_gt_score[:num_gt] = sample['gt_score'] sample['gt_score'] = pad_gt_score if 'is_crowd' in sample: pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32) if num_gt > 0: pad_is_crowd[:num_gt] = sample['is_crowd'] sample['is_crowd'] = pad_is_crowd if 'difficult' in sample: pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32) if num_gt > 0: pad_diff[:num_gt] = sample['difficult'] sample['difficult'] = pad_diff if 'gt_joints' in sample: num_joints = sample['gt_joints'].shape[1] pad_gt_joints = np.zeros( (num_max_boxes, num_joints, 3), dtype=np.float32) if num_gt > 0: pad_gt_joints[:num_gt] = sample['gt_joints'] sample['gt_joints'] = pad_gt_joints if 'gt_areas' in sample: pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32) if num_gt > 0: pad_gt_areas[:num_gt, 0] = sample['gt_areas'] sample['gt_areas'] = pad_gt_areas # gt_segm if 'gt_segm' in sample: pad_gt_segm = np.zeros( (num_max_boxes, *sample['gt_segm'].shape[-2:]), dtype=np.uint8) if num_gt > 0: pad_gt_segm[:num_gt] = sample['gt_segm'] sample['gt_segm'] = pad_gt_segm.astype(np.float32) return samples @register_op class PadRGT(BaseOperator): """ Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... The num_max_boxes is the largest for batch. 
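    Compared with PadGT, this operator additionally pads the rotated-box
    related fields ('gt_poly' and 'gt_rbox') through `pad_field`, as the
    names/dims lists in `__call__` below show.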
    Args:
        return_gt_mask (bool): If true, return `pad_gt_mask`,
                               1 means bbox, 0 means no bbox.
    """

    def __init__(self, return_gt_mask=True):
        super(PadRGT, self).__init__()
        self.return_gt_mask = return_gt_mask

    def pad_field(self, sample, field, num_gt):
        name, shape, dtype = field
        if name in sample:
            pad_v = np.zeros(shape, dtype=dtype)
            if num_gt > 0:
                pad_v[:num_gt] = sample[name]
            sample[name] = pad_v

    def __call__(self, samples, context=None):
        num_max_boxes = max([len(s['gt_bbox']) for s in samples])
        for sample in samples:
            if self.return_gt_mask:
                sample['pad_gt_mask'] = np.zeros(
                    (num_max_boxes, 1), dtype=np.float32)
            if num_max_boxes == 0:
                continue

            num_gt = len(sample['gt_bbox'])
            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
            if num_gt > 0:
                pad_gt_class[:num_gt] = sample['gt_class']
                pad_gt_bbox[:num_gt] = sample['gt_bbox']
            sample['gt_class'] = pad_gt_class
            sample['gt_bbox'] = pad_gt_bbox
            # pad_gt_mask
            if 'pad_gt_mask' in sample:
                sample['pad_gt_mask'][:num_gt] = 1
            # gt_score
            names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']
            dims = [1, 1, 1, 8, 5]
            dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]
            for name, dim, dtype in zip(names, dims, dtypes):
                self.pad_field(sample, [name, (num_max_boxes, dim), dtype],
                               num_gt)

        return samples


@register_op
class Gt2CenterTrackTarget(BaseOperator):
    __shared__ = ['num_classes']
    """Gt2CenterTrackTarget
    Generate CenterTrack targets from ground-truth data.

    Args:
        num_classes (int): The number of classes, 1 by default.
        down_ratio (int): The down sample ratio between output feature and
                          input image.
        max_objs (int): The maximum objects detected, 256 by default.
    """

    def __init__(self,
                 num_classes=1,
                 down_ratio=4,
                 max_objs=256,
                 hm_disturb=0.05,
                 lost_disturb=0.4,
                 fp_disturb=0.1,
                 pre_hm=True,
                 add_tracking=True,
                 add_ltrb_amodal=True):
        super(Gt2CenterTrackTarget, self).__init__()
        self.nc = num_classes
        self.down_ratio = down_ratio
        self.max_objs = max_objs

        self.hm_disturb = hm_disturb
        self.lost_disturb = lost_disturb
        self.fp_disturb = fp_disturb
        self.pre_hm = pre_hm
        self.add_tracking = add_tracking
        self.add_ltrb_amodal = add_ltrb_amodal

    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
                      gt_class_pre, gt_track_id_pre):
        hm_h, hm_w = input_h, input_w
        return_hm = self.pre_hm
        pre_hm = np.zeros(
            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None
        pre_cts, track_ids = [], []
        for i, (
                bbox, cls, track_id
        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
            cls = int(cls)
            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
            max_rad = 1
            if (h > 0 and w > 0):
                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
                radius = max(0, int(radius))
                max_rad = max(max_rad, radius)
                ct = np.array(
                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                    dtype=np.float32)
                ct0 = ct.copy()
                conf = 1

                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
                conf = 1 if np.random.rand() > self.lost_disturb else 0

                ct_int = ct.astype(np.int32)
                if conf == 0:
                    pre_cts.append(ct / self.down_ratio)
                else:
                    pre_cts.append(ct0 / self.down_ratio)

                track_ids.append(track_id)
                if return_hm:
                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)

                if np.random.rand() < self.fp_disturb and return_hm:
                    ct2 = ct0.copy()  # Hard code heatmap
disturb ratio, haven't tried other numbers. ct2[0] = ct2[0] + np.random.randn() * 0.05 * w ct2[1] = ct2[1] + np.random.randn() * 0.05 * h ct2_int = ct2.astype(np.int32) draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf) return pre_hm, pre_cts, track_ids def __call__(self, sample, context=None): input_h, input_w = sample['image'].shape[1:] output_h = input_h // self.down_ratio output_w = input_w // self.down_ratio gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] # init hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) wh = np.zeros((self.max_objs, 2), dtype=np.float32) reg = np.zeros((self.max_objs, 2), dtype=np.float32) ind = np.zeros((self.max_objs), dtype=np.int64) reg_mask = np.zeros((self.max_objs), dtype=np.int32) if self.add_tracking: tr = np.zeros((self.max_objs, 2), dtype=np.float32) if self.add_ltrb_amodal: ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32) trans_output = get_affine_transform( center=sample['center'], input_size=[sample['scale'], sample['scale']], rot=0, output_size=[output_w, output_h]) pre_hm, pre_cts, track_ids = self._get_pre_dets( input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'], sample['pre_gt_class'], sample['pre_gt_track_id']) for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): cls = int(cls) rect = np.array( [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]], [bbox[2], bbox[1]]], dtype=np.float32) for t in range(4): rect[t] = affine_transform(rect[t], trans_output) bbox[:2] = rect[:, 0].min(), rect[:, 1].min() bbox[2:] = rect[:, 0].max(), rect[:, 1].max() bbox_amodal = copy.deepcopy(bbox) bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] if h > 0 and w > 0: radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) radius = max(0, int(radius)) ct = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32) ct_int = ct.astype(np.int32) # get hm,wh,reg,ind,ind_mask draw_umich_gaussian(hm[cls], ct_int, radius) wh[i] = 1. * w, 1. * h reg[i] = ct - ct_int ind[i] = ct_int[1] * output_w + ct_int[0] reg_mask[i] = 1 if self.add_tracking: if sample['gt_track_id'][i] in track_ids: pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][ i])] tr[i] = pre_ct - ct_int if self.add_ltrb_amodal: ltrb_amodal[i] = \ bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \ bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1] new_sample = {'image': sample['image']} new_sample['index'] = ind new_sample['index_mask'] = reg_mask new_sample['heatmap'] = hm new_sample['size'] = wh new_sample['offset'] = reg if self.add_tracking: new_sample['tracking'] = tr if self.add_ltrb_amodal: new_sample['ltrb_amodal'] = ltrb_amodal new_sample['pre_image'] = sample['pre_image'] new_sample['pre_hm'] = pre_hm del sample return new_sample @register_op class BatchRandomResizeForSSOD(BaseOperator): """ Resize image to target size randomly. 
    The target size and the interpolation method can both be sampled at
    random.

    Args:
        target_size (int, list, tuple): image target size, if random size is
            True, must be list or tuple
        keep_ratio (bool): whether to keep the aspect ratio or not,
            default True
        interp (int): the interpolation method
        random_size (bool): whether to randomly select a target size of image
        random_interp (bool): whether to randomly select an interpolation
            method
    """

    def __init__(self,
                 target_size,
                 keep_ratio,
                 interp=cv2.INTER_NEAREST,
                 random_size=True,
                 random_interp=False):
        super(BatchRandomResizeForSSOD, self).__init__()
        self.keep_ratio = keep_ratio
        self.interps = [
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]
        self.interp = interp
        assert isinstance(target_size, (
            int, Sequence)), "target_size must be int, list or tuple"
        if random_size and not isinstance(target_size, list):
            raise TypeError(
                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
                format(type(target_size)))
        self.target_size = target_size
        self.random_size = random_size
        self.random_interp = random_interp

    def __call__(self, samples, context=None):
        # `index` records which candidate size was picked so that a paired
        # batch can be resized to the same size by passing it back in as
        # `context`; it is None when a fixed target_size is used.
        if self.random_size:
            index = np.random.choice(len(self.target_size))
            target_size = self.target_size[index]
        else:
            index = None
            target_size = self.target_size
        if context is not None:
            index = context
            target_size = self.target_size[context]

        if self.random_interp:
            interp = np.random.choice(self.interps)
        else:
            interp = self.interp
        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
        return [resizer(samples, context=context), index]


================================================
FILE: ppdet/data/transform/culane_operators.py
================================================
import numpy as np
import imgaug.augmenters as iaa

from .operators import BaseOperator, register_op
from ppdet.utils.logger import setup_logger
from ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation

logger = setup_logger(__name__)

__all__ = [
    "CULaneTrainProcess", "CULaneDataProcess", "HorizontalFlip",
    "ChannelShuffle", "CULaneAffine", "CULaneResize", "OneOfBlur",
    "MultiplyAndAddToBrightness", "AddToHueAndSaturation"
]


def trainTransforms(img_h, img_w):
    transforms = [{
        'name': 'Resize',
        'parameters': dict(size=dict(
            height=img_h, width=img_w)),
        'p': 1.0
    }, {
        'name': 'HorizontalFlip',
        'parameters': dict(p=1.0),
        'p': 0.5
    }, {
        'name': 'ChannelShuffle',
        'parameters': dict(p=1.0),
        'p': 0.1
    }, {
        'name': 'MultiplyAndAddToBrightness',
        'parameters': dict(
            mul=(0.85, 1.15), add=(-10, 10)),
        'p': 0.6
    }, {
        'name': 'AddToHueAndSaturation',
        'parameters': dict(value=(-10, 10)),
        'p': 0.7
    }, {
        'name': 'OneOf',
        'transforms': [
            dict(
                name='MotionBlur', parameters=dict(k=(3, 5))),
            dict(
                name='MedianBlur', parameters=dict(k=(3, 5)))
        ],
        'p': 0.2
    }, {
        'name': 'Affine',
        'parameters': dict(
            translate_percent=dict(
                x=(-0.1, 0.1), y=(-0.1, 0.1)),
            rotate=(-10, 10),
            scale=(0.8, 1.2)),
        'p': 0.7
    }, {
        'name': 'Resize',
        'parameters': dict(size=dict(
            height=img_h, width=img_w)),
        'p': 1.0
    }]
    return transforms


@register_op
class CULaneTrainProcess(BaseOperator):
    def __init__(self, img_w, img_h):
        super(CULaneTrainProcess, self).__init__()
        self.img_w = img_w
        self.img_h = img_h
        self.transforms = trainTransforms(self.img_h, self.img_w)

        if self.transforms is not None:
            img_transforms = []
            for aug in self.transforms:
                p = aug['p']
                if aug['name'] != 'OneOf':
                    img_transforms.append(
                        iaa.Sometimes(
                            p=p,
                            then_list=getattr(iaa, aug['name'])(**aug[
                                'parameters'])))
                else:
                    img_transforms.append(
                        iaa.Sometimes(
                            p=p,
                            then_list=iaa.OneOf([
                                getattr(iaa,
aug_['name'])(**aug_['parameters']) for aug_ in aug['transforms'] ]))) else: img_transforms = [] self.iaa_transform = iaa.Sequential(img_transforms) def apply(self, sample, context=None): img, line_strings, seg = self.iaa_transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg return sample @register_op class CULaneDataProcess(BaseOperator): def __init__(self, img_w, img_h, num_points, max_lanes): super(CULaneDataProcess, self).__init__() self.img_w = img_w self.img_h = img_h self.num_points = num_points self.n_offsets = num_points self.n_strips = num_points - 1 self.strip_size = self.img_h / self.n_strips self.max_lanes = max_lanes self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size) def apply(self, sample, context=None): data = {} line_strings = sample['lanes'] line_strings.clip_out_of_image_() new_anno = {'lanes': linestrings_to_lanes(line_strings)} for i in range(30): try: annos = transform_annotation( self.img_w, self.img_h, self.max_lanes, self.n_offsets, self.offsets_ys, self.n_strips, self.strip_size, new_anno) label = annos['label'] lane_endpoints = annos['lane_endpoints'] break except: if (i + 1) == 30: logger.critical('Transform annotation failed 30 times :(') exit() sample['image'] = sample['image'].astype(np.float32) / 255. data['image'] = sample['image'].transpose(2, 0, 1) data['lane_line'] = label data['seg'] = sample['seg'] data['full_img_path'] = sample['full_img_path'] data['img_name'] = sample['img_name'] data['im_id'] = sample['im_id'] if 'mask' in sample.keys(): data['seg'] = sample['mask'].get_arr() data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32) data['scale_factor'] = np.array([1., 1.], dtype=np.float32) return data @register_op class CULaneResize(BaseOperator): def __init__(self, img_h, img_w, prob=0.5): super(CULaneResize, self).__init__() self.img_h = img_h self.img_w = img_w self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes(self.prob, iaa.Resize({ "height": self.img_h, "width": self.img_w })) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'].copy().astype(np.uint8), line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class HorizontalFlip(BaseOperator): def __init__(self, prob=0.5): super(HorizontalFlip, self).__init__() self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class ChannelShuffle(BaseOperator): def __init__(self, prob=0.1): super(ChannelShuffle, self).__init__() self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) 
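            # imgaug returns transformed copies rather than mutating the
            # inputs in place, so write the results back into the sample dict.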
sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class MultiplyAndAddToBrightness(BaseOperator): def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5): super(MultiplyAndAddToBrightness, self).__init__() self.mul = tuple(mul) self.add = tuple(add) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.MultiplyAndAddToBrightness( mul=self.mul, add=self.add)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class AddToHueAndSaturation(BaseOperator): def __init__(self, value=(-10, 10), prob=0.5): super(AddToHueAndSaturation, self).__init__() self.value = tuple(value) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.AddToHueAndSaturation(value=self.value)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class OneOfBlur(BaseOperator): def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5): super(OneOfBlur, self).__init__() self.MotionBlur_k = tuple(MotionBlur_k) self.MedianBlur_k = tuple(MedianBlur_k) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.OneOf([ iaa.MotionBlur(k=self.MotionBlur_k), iaa.MedianBlur(k=self.MedianBlur_k) ])) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class CULaneAffine(BaseOperator): def __init__(self, translate_percent_x=(-0.1, 0.1), translate_percent_y=(-0.1, 0.1), rotate=(3, 5), scale=(0.8, 1.2), prob=0.5): super(CULaneAffine, self).__init__() self.translate_percent = { 'x': tuple(translate_percent_x), 'y': tuple(translate_percent_y) } self.rotate = tuple(rotate) self.scale = tuple(scale) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.Affine( translate_percent=self.translate_percent, rotate=self.rotate, scale=self.scale)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample ================================================ FILE: ppdet/data/transform/gridmask_utils.py ================================================ # 
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py from __future__ import absolute_import from __future__ import print_function from __future__ import division import numpy as np from PIL import Image class Gridmask(object): def __init__(self, use_h=True, use_w=True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7, upper_iter=360000): super(Gridmask, self).__init__() self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode = mode self.prob = prob self.st_prob = prob self.upper_iter = upper_iter def __call__(self, x, curr_iter): self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter) if np.random.rand() > self.prob: return x h, w, _ = x.shape hh = int(1.5 * h) ww = int(1.5 * w) d = np.random.randint(2, h) self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh // d): s = d * i + st_h t = min(s + self.l, hh) mask[s:t, :] *= 0 if self.use_w: for i in range(ww // d): s = d * i + st_w t = min(s + self.l, ww) mask[:, s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + w].astype(np.float32) if self.mode == 1: mask = 1 - mask mask = np.expand_dims(mask, axis=-1) if self.offset: offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32) x = (x * mask + offset * (1 - mask)).astype(x.dtype) else: x = (x * mask).astype(x.dtype) return x ================================================ FILE: ppdet/data/transform/keypoint_operators.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# function:
#    operators to process sample,
#    eg: decode/resize/crop image

from __future__ import absolute_import

try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence

import cv2
import numpy as np
import math
import copy

from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

registered_ops = []

__all__ = [
    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
    'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter',
    'TopDownGetRandomScaleRotation', 'TopDownAffine', 'ToHeatmapsTopDown',
    'ToHeatmapsTopDown_DARK', 'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
    'FlipPose', 'PETR_Resize'
]


def register_keypointop(cls):
    return serializable(cls)


@register_keypointop
class KeyPointFlip(object):
    """Flip the image horizontally with probability `flip_prob` and flip the
    keypoint coordinates accordingly. Left and right keypoints must be
    exchanged during the flip, because a right keypoint becomes a left
    keypoint once the image is mirrored.

    Args:
        flip_permutation (list[17]): the left-right exchange order list
            corresponding to [0,1,2,...,16]
        hmsize (list[2]): output heatmap's shape list of different scale
            outputs of higherhrnet
        flip_prob (float): the probability of flipping the image
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the image, mask and coords after transformed
    """

    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
        super(KeyPointFlip, self).__init__()
        assert isinstance(flip_permutation, Sequence)
        self.flip_permutation = flip_permutation
        self.flip_prob = flip_prob
        self.hmsize = hmsize

    def _flipjoints(self, records, sizelst):
        '''
        records['gt_joints'] is Sequence in higherhrnet
        '''
        if not ('gt_joints' in records and len(records['gt_joints']) > 0):
            return records

        kpts_lst = records['gt_joints']
        if isinstance(kpts_lst, Sequence):
            for idx, hmsize in enumerate(sizelst):
                if kpts_lst[idx].ndim == 3:
                    kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
                else:
                    kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
                kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
        else:
            hmsize = sizelst[0]
            if kpts_lst.ndim == 3:
                kpts_lst = kpts_lst[:, self.flip_permutation]
            else:
                kpts_lst = kpts_lst[self.flip_permutation]
            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]

        records['gt_joints'] = kpts_lst
        return records

    def _flipmask(self, records, sizelst):
        if not 'mask' in records:
            return records

        mask_lst = records['mask']
        for idx, hmsize in enumerate(sizelst):
            if len(mask_lst) > idx:
                mask_lst[idx] = mask_lst[idx][:, ::-1]
        records['mask'] = mask_lst
        return records

    def _flipbbox(self, records, sizelst):
        if not 'gt_bbox' in records:
            return records

        bboxes = records['gt_bbox']
        hmsize = sizelst[0]
        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
        records['gt_bbox'] = bboxes
        return records

    def __call__(self, records):
        flip = np.random.random() < self.flip_prob
        if flip:
            image = records['image']
            image = image[:, ::-1]
            records['image'] = image
            if self.hmsize is None:
                sizelst = [image.shape[1]]
            else:
                sizelst = self.hmsize
            self._flipjoints(records, sizelst)
            self._flipmask(records, sizelst)
            self._flipbbox(records, sizelst)
        return records


@register_keypointop
class RandomAffine(object):
    """apply affine transform to image, mask and coords to achieve the
    rotate, scale and shift effect for the training image

    Args:
        max_degree (float): the max absolute rotate degree to apply,
            transform range is [-max_degree, max_degree]
        scale (list[2]): the scale range to apply, transform range is
            [min, max]
        max_shift (float): the max absolute shift ratio to apply, transform
            range is [-max_shift*imagesize, max_shift*imagesize]
        hmsize (list[2]): output heatmap's shape list of different scale
            outputs of higherhrnet
        trainsize (list[2]): the standard length used to train, the
            'scale_type' side of [h, w] will be resized to trainsize for
            standard
        scale_type (str): the side of [h, w] used for trainsize, chosen
            from 'short', 'long' and 'wh'
        boldervalue (list[3]): the border fill value used when warping the
            image
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the image, mask and coords after transformed
    """

    def __init__(self,
                 max_degree=30,
                 scale=[0.75, 1.5],
                 max_shift=0.2,
                 hmsize=None,
                 trainsize=[512, 512],
                 scale_type='short',
                 boldervalue=[114, 114, 114]):
        super(RandomAffine, self).__init__()
        self.max_degree = max_degree
        self.min_scale = scale[0]
        self.max_scale = scale[1]
        self.max_shift = max_shift
        self.hmsize = hmsize
        self.trainsize = trainsize
        self.scale_type = scale_type
        self.boldervalue = boldervalue

    def _get_affine_matrix_old(self, center, scale, res, rot=0):
        """Generate transformation matrix."""
        h = scale
        t = np.zeros((3, 3), dtype=np.float32)
        t[0, 0] = float(res[1]) / h
        t[1, 1] = float(res[0]) / h
        t[0, 2] = res[1] * (-float(center[0]) / h + .5)
        t[1, 2] = res[0] * (-float(center[1]) / h + .5)
        t[2, 2] = 1
        if rot != 0:
            rot = -rot  # To match direction of rotation from cropping
            rot_mat = np.zeros((3, 3), dtype=np.float32)
            rot_rad = rot * np.pi / 180
            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
            rot_mat[0, :2] = [cs, -sn]
            rot_mat[1, :2] = [sn, cs]
            rot_mat[2, 2] = 1
            # Need to rotate around center
            t_mat = np.eye(3)
            t_mat[0, 2] = -res[1] / 2
            t_mat[1, 2] = -res[0] / 2
            t_inv = t_mat.copy()
            t_inv[:2, 2] *= -1
            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
        return t

    def _get_affine_matrix(self, center, scale, res, rot=0):
        """Generate transformation matrix."""
        w, h = scale
        t = np.zeros((3, 3), dtype=np.float32)
        t[0, 0] = float(res[0]) / w
        t[1, 1] = float(res[1]) / h
        t[0, 2] = res[0] * (-float(center[0]) / w + .5)
        t[1, 2] = res[1] * (-float(center[1]) / h + .5)
        t[2, 2] = 1
        if rot != 0:
            rot = -rot  # To match direction of rotation from cropping
            rot_mat = np.zeros((3, 3), dtype=np.float32)
            rot_rad = rot * np.pi / 180
            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
            rot_mat[0, :2] = [cs, -sn]
            rot_mat[1, :2] = [sn, cs]
            rot_mat[2, 2] = 1
            # Need to rotate around center
            t_mat = np.eye(3)
            t_mat[0, 2] = -res[0] / 2
            t_mat[1, 2] = -res[1] / 2
            t_inv = t_mat.copy()
            t_inv[:2, 2] *= -1
            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
        return t

    def _affine_joints_mask(self,
                            degree,
                            center,
                            roi_size,
                            dsize,
                            keypoints=None,
                            heatmap_mask=None,
                            gt_bbox=None):
        kpts = None
        mask = None
        bbox = None
        mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
                                                  degree)[:2]
        if heatmap_mask is not None:
            mask = cv2.warpAffine(heatmap_mask, mask_affine_mat, dsize)
            mask = ((mask / 255) > 0.5).astype(np.float32)
        if keypoints is not None:
            kpts = copy.deepcopy(keypoints)
            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
                                                mask_affine_mat)
            kpts[(kpts[..., 0]) > dsize[0], :] = 0
            kpts[(kpts[..., 1]) > dsize[1], :] = 0
            kpts[(kpts[..., 0]) < 0, :] = 0
            kpts[(kpts[..., 1]) < 0, :] = 0
        if gt_bbox is not None:
            temp_bbox = gt_bbox[:, [0, 3, 2, 1]]
            cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1)
            gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat)
            bbox = np.zeros_like(gt_bbox)
            bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0])
            bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0])
            bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1])
            bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1])
        return kpts, mask, bbox

    def __call__(self, records):
        image = records['image']
        shape = np.array(image.shape[:2][::-1])
        keypoints = None
        heatmap_mask = None
        gt_bbox = None
        if 'gt_joints' in records:
            keypoints = records['gt_joints']
        if 'mask' in records:
            heatmap_mask = records['mask']
            heatmap_mask *= 255
        if 'gt_bbox' in records:
            gt_bbox = records['gt_bbox']

        degree = (np.random.random() * 2 - 1) * self.max_degree
        center = np.array(shape) / 2
        aug_scale = np.random.random() * (self.max_scale - self.min_scale
                                          ) + self.min_scale
        if self.scale_type == 'long':
            scale = np.array([max(shape[0], shape[1]) / 1.0] * 2)
        elif self.scale_type == 'short':
            scale = np.array([min(shape[0], shape[1]) / 1.0] * 2)
        elif self.scale_type == 'wh':
            scale = shape
        else:
            raise ValueError('Unknown scale type: {}'.format(self.scale_type))
        roi_size = aug_scale * scale
        dx = int(0)
        dy = int(0)
        if self.max_shift > 0:
            dx = np.random.randint(-self.max_shift * roi_size[0],
                                   self.max_shift * roi_size[0])
            dy = np.random.randint(-self.max_shift * roi_size[1],
                                   self.max_shift * roi_size[1])
        center += np.array([dx, dy])
        input_size = 2 * center
        if self.trainsize != -1:
            dsize = self.trainsize
            imgshape = (dsize)
        else:
            dsize = scale
            imgshape = (shape.tolist())

        image_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
                                                   degree)[:2]
        image = cv2.warpAffine(
            image,
            image_affine_mat,
            imgshape,
            flags=cv2.INTER_LINEAR,
            borderValue=self.boldervalue)

        if self.hmsize is None:
            kpts, mask, gt_bbox = self._affine_joints_mask(
                degree, center, roi_size, dsize, keypoints, heatmap_mask,
                gt_bbox)
            records['image'] = image
            if kpts is not None:
                records['gt_joints'] = kpts
            if mask is not None:
                records['mask'] = mask
            if gt_bbox is not None:
                records['gt_bbox'] = gt_bbox
            return records

        kpts_lst = []
        mask_lst = []
        for hmsize in self.hmsize:
            kpts, mask, gt_bbox = self._affine_joints_mask(
                degree, center, roi_size, [hmsize, hmsize], keypoints,
                heatmap_mask, gt_bbox)
            kpts_lst.append(kpts)
            mask_lst.append(mask)
        records['image'] = image
        if 'gt_joints' in records:
            records['gt_joints'] = kpts_lst
        if 'mask' in records:
            records['mask'] = mask_lst
        if 'gt_bbox' in records:
            records['gt_bbox'] = gt_bbox
        return records


@register_keypointop
class EvalAffine(object):
    """apply affine transform to the image, resizing the shorter side of
    [h, w] to the standard size for evaluation

    Args:
        size (int): the standard length used to train, the shorter side of
            [h, w] will be resized to this size
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the image, mask and coords after transformed
    """

    def __init__(self, size, stride=64):
        super(EvalAffine, self).__init__()
        self.size = size
        self.stride = stride

    def __call__(self, records):
        image = records['image']
        mask = records['mask'] if 'mask' in records else None
        s = self.size
        h, w, _ = image.shape
        trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False)
        image_resized = cv2.warpAffine(image, trans, size_resized)
        if mask is not None:
            mask = cv2.warpAffine(mask, trans, size_resized)
            records['mask'] = mask
        if 'gt_joints' in records:
            del records['gt_joints']
        records['image'] = image_resized
        records['scale_factor'] = self.size / min(h, w)
        return records
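# Illustrative usage of the eval-time affine resize above (a sketch, not part
# of the original file); `img` is assumed to be an HWC uint8 array:
#
#     op = EvalAffine(size=512)
#     records = op({'image': img})
#     # records['image'] is warped so its shorter side matches 512, and
#     # records['scale_factor'] == 512 / min(h, w)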
@register_keypointop
class NormalizePermute(object):
    def __init__(self,
                 mean=[123.675, 116.28, 103.53],
                 std=[58.395, 57.120, 57.375],
                 is_scale=True):
        super(NormalizePermute, self).__init__()
        self.mean = mean
        self.std = std
        self.is_scale = is_scale

    def __call__(self, records):
        image = records['image']
        image = image.astype(np.float32)
        if self.is_scale:
            image /= 255.
        image = image.transpose((2, 0, 1))
        mean = np.array(self.mean, dtype=np.float32)
        std = np.array(self.std, dtype=np.float32)
        invstd = 1. / std
        # normalize each channel in place: (channel - mean) * (1 / std)
        for v, m, s in zip(image, mean, invstd):
            v.__isub__(m).__imul__(s)
        records['image'] = image
        return records


@register_keypointop
class TagGenerate(object):
    """record gt coords for aeloss to sample coords value in tagmaps

    Args:
        num_joints (int): the keypoint numbers of dataset to train
        max_people (int): the maximum number of people supported when
            sampling for the associative-embedding loss
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the gt coords used in tagmap
    """

    def __init__(self, num_joints, max_people=30):
        super(TagGenerate, self).__init__()
        self.max_people = max_people
        self.num_joints = num_joints

    def __call__(self, records):
        kpts_lst = records['gt_joints']
        kpts = kpts_lst[0]
        tagmap = np.zeros(
            (self.max_people, self.num_joints, 4), dtype=np.int64)
        inds = np.where(kpts[..., 2] > 0)
        p, j = inds[0], inds[1]
        visible = kpts[inds]
        # tagmap is [p, j, 4], where the last dim is (j, y, x, valid)
        tagmap[p, j, 0] = j
        tagmap[p, j, 1] = visible[..., 1]  # y
        tagmap[p, j, 2] = visible[..., 0]  # x
        tagmap[p, j, 3] = 1
        records['tagmap'] = tagmap
        del records['gt_joints']
        return records


@register_keypointop
class ToHeatmaps(object):
    """generate the gaussian heatmaps of keypoints for the heatmap loss

    Args:
        num_joints (int): the keypoint numbers of dataset to train
        hmsize (list[2]): output heatmap's shape list of different scale
            outputs of higherhrnet
        sigma (float): the std of the generated gaussian kernel
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the heatmaps used for the heatmap loss
    """

    def __init__(self, num_joints, hmsize, sigma=None):
        super(ToHeatmaps, self).__init__()
        self.num_joints = num_joints
        self.hmsize = np.array(hmsize)
        if sigma is None:
            sigma = hmsize[0] // 64
        self.sigma = sigma

        r = 6 * sigma + 3
        x = np.arange(0, r, 1, np.float32)
        y = x[:, None]
        x0, y0 = 3 * sigma + 1, 3 * sigma + 1
        self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))

    def __call__(self, records):
        kpts_lst = records['gt_joints']
        mask_lst = records['mask']
        for idx, hmsize in enumerate(self.hmsize):
            mask = mask_lst[idx]
            kpts = kpts_lst[idx]
            heatmaps = np.zeros((self.num_joints, hmsize, hmsize))
            inds = np.where(kpts[..., 2] > 0)
            visible = kpts[inds].astype(np.int64)[..., :2]
            ul = np.round(visible - 3 * self.sigma - 1)
            br = np.round(visible + 3 * self.sigma + 2)
            sul = np.maximum(0, -ul)
            sbr = np.minimum(hmsize, br) - ul
            dul = np.clip(ul, 0, hmsize - 1)
            dbr = np.clip(br, 0, hmsize)
            for i in range(len(visible)):
                if visible[i][0] < 0 or visible[i][1] < 0 or \
                        visible[i][0] >= hmsize or visible[i][1] >= hmsize:
                    continue
                dx1, dy1 = dul[i]
                dx2, dy2 = dbr[i]
                sx1, sy1 = sul[i]
                sx2, sy2 = sbr[i]
                heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum(
                    self.gaussian[sy1:sy2, sx1:sx2],
                    heatmaps[inds[1][i], dy1:dy2, dx1:dx2])
            records['heatmap_gt{}x'.format(idx + 1)] = heatmaps
            records['mask_{}x'.format(idx + 1)] = mask
        del records['mask']
        return records


@register_keypointop
class RandomFlipHalfBodyTransform(object):
    """apply data augment to image and coords to achieve the flip, scale,
    rotate and half body
transform effect for training image Args: trainsize (list):[w, h], Image target size upper_body_ids (list): The upper body joint ids flip_pairs (list): The left-right joints exchange order list pixel_std (int): The pixel std of the scale scale (float): The scale factor to transform the image rot (int): The rotate factor to transform the image num_joints_half_body (int): The joints threshold of the half body transform prob_half_body (float): The threshold of the half body transform flip (bool): Whether to flip the image Returns: records(dict): contain the image and coords after tranformed """ def __init__(self, trainsize, upper_body_ids, flip_pairs, pixel_std, scale=0.35, rot=40, num_joints_half_body=8, prob_half_body=0.3, flip=True, rot_prob=0.6): super(RandomFlipHalfBodyTransform, self).__init__() self.trainsize = trainsize self.upper_body_ids = upper_body_ids self.flip_pairs = flip_pairs self.pixel_std = pixel_std self.scale = scale self.rot = rot self.num_joints_half_body = num_joints_half_body self.prob_half_body = prob_half_body self.flip = flip self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] self.rot_prob = rot_prob def halfbody_transform(self, joints, joints_vis): upper_joints = [] lower_joints = [] for joint_id in range(joints.shape[0]): if joints_vis[joint_id][0] > 0: if joint_id in self.upper_body_ids: upper_joints.append(joints[joint_id]) else: lower_joints.append(joints[joint_id]) if np.random.randn() < 0.5 and len(upper_joints) > 2: selected_joints = upper_joints else: selected_joints = lower_joints if len( lower_joints) > 2 else upper_joints if len(selected_joints) < 2: return None, None selected_joints = np.array(selected_joints, dtype=np.float32) center = selected_joints.mean(axis=0)[:2] left_top = np.amin(selected_joints, axis=0) right_bottom = np.amax(selected_joints, axis=0) w = right_bottom[0] - left_top[0] h = right_bottom[1] - left_top[1] if w > self.aspect_ratio * h: h = w * 1.0 / self.aspect_ratio elif w < self.aspect_ratio * h: w = h * self.aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) scale = scale * 1.5 return center, scale def flip_joints(self, joints, joints_vis, width, matched_parts): joints[:, 0] = width - joints[:, 0] - 1 for pair in matched_parts: joints[pair[0], :], joints[pair[1], :] = \ joints[pair[1], :], joints[pair[0], :].copy() joints_vis[pair[0], :], joints_vis[pair[1], :] = \ joints_vis[pair[1], :], joints_vis[pair[0], :].copy() return joints * joints_vis, joints_vis def __call__(self, records): image = records['image'] joints = records['gt_joints'] joints_vis = records['joints_vis'] c = records['center'] s = records['scale'] r = 0 if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and np.random.rand() < self.prob_half_body): c_half_body, s_half_body = self.halfbody_transform(joints, joints_vis) if c_half_body is not None and s_half_body is not None: c, s = c_half_body, s_half_body sf = self.scale rf = self.rot s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if np.random.random() <= self.rot_prob else 0 if self.flip and np.random.random() <= 0.5: image = image[:, ::-1, :] joints, joints_vis = self.flip_joints( joints, joints_vis, image.shape[1], self.flip_pairs) c[0] = image.shape[1] - c[0] - 1 records['image'] = image records['gt_joints'] = joints records['joints_vis'] = joints_vis records['center'] = c records['scale'] = s records['rotate'] = r return records @register_keypointop class 
AugmentationbyInformantionDropping(object):
    """AID: Augmentation by Information Dropping.
    Please refer to https://arxiv.org/abs/2008.07139

    Args:
        prob_cutout (float): The probability of the Cutout augmentation.
        offset_factor (float): Offset factor of cutout center.
        num_patch (int): Number of patches to be cutout.
        records(dict): the dict contained the image and coords

    Returns:
        records (dict): contain the image and coords after transformed
    """

    def __init__(self,
                 trainsize,
                 prob_cutout=0.0,
                 offset_factor=0.2,
                 num_patch=1):
        self.prob_cutout = prob_cutout
        self.offset_factor = offset_factor
        self.num_patch = num_patch
        self.trainsize = trainsize

    def _cutout(self, img, joints, joints_vis):
        height, width, _ = img.shape
        img = img.reshape((height * width, -1))
        feat_x_int = np.arange(0, width)
        feat_y_int = np.arange(0, height)
        feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int)
        feat_x_int = feat_x_int.reshape((-1, ))
        feat_y_int = feat_y_int.reshape((-1, ))
        for _ in range(self.num_patch):
            vis_idx, _ = np.where(joints_vis > 0)
            occlusion_joint_id = np.random.choice(vis_idx)
            center = joints[occlusion_joint_id, 0:2]
            offset = np.random.randn(
                2) * self.trainsize[0] * self.offset_factor
            center = center + offset
            radius = np.random.uniform(0.1, 0.2) * self.trainsize[0]
            x_offset = (center[0] - feat_x_int) / radius
            y_offset = (center[1] - feat_y_int) / radius
            dis = x_offset**2 + y_offset**2
            keep_pos = np.where((dis <= 1) & (dis >= 0))[0]
            img[keep_pos, :] = 0
        img = img.reshape((height, width, -1))
        return img

    def __call__(self, records):
        img = records['image']
        joints = records['gt_joints']
        joints_vis = records['joints_vis']
        if np.random.rand() < self.prob_cutout:
            img = self._cutout(img, joints, joints_vis)
        records['image'] = img
        return records


@register_keypointop
class TopDownRandomFlip(object):
    """Data augmentation with random image flip.

    Args:
        flip_perm: (list[tuple]): Pairs of keypoints which are mirrored
            (for example, left ear and right ear).
        flip_prob (float): Probability of flip.
    """

    def __init__(self, flip_perm=[], flip_prob=0.5):
        self.flip_perm = flip_perm
        self.flip_prob = flip_prob

    def flip_joints(self, joints_3d, joints_3d_visible, img_width,
                    flip_pairs):
        assert len(joints_3d) == len(joints_3d_visible)
        assert img_width > 0

        joints_3d_flipped = joints_3d.copy()
        joints_3d_visible_flipped = joints_3d_visible.copy()

        # Swap left-right parts
        for left, right in flip_pairs:
            joints_3d_flipped[left, :] = joints_3d[right, :]
            joints_3d_flipped[right, :] = joints_3d[left, :]
            joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]
            joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]

        # Flip horizontally
        joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0]
        joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0)

        return joints_3d_flipped, joints_3d_visible_flipped

    def __call__(self, results):
        """Perform data augmentation with random image flip."""
        # flip with probability `flip_prob`, as documented above; otherwise
        # return the results unchanged
        if np.random.rand() > self.flip_prob:
            return results

        img = results['image']
        joints_3d = results['gt_joints']
        joints_3d_visible = results['joints_vis']
        center = results['center']

        # A flag indicating whether the image is flipped,
        # which can be used by child class.
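        # `image` may be a single HWC array or a list of frames; each is
        # flipped along the horizontal (width) axis and the stored center
        # is mirrored to match.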
if not isinstance(img, list): img = img[:, ::-1, :] else: img = [i[:, ::-1, :] for i in img] if not isinstance(img, list): joints_3d, joints_3d_visible = self.flip_joints( joints_3d, joints_3d_visible, img.shape[1], self.flip_perm) center[0] = img.shape[1] - center[0] - 1 else: joints_3d, joints_3d_visible = self.flip_joints( joints_3d, joints_3d_visible, img[0].shape[1], self.flip_perm) center[0] = img[0].shape[1] - center[0] - 1 results['image'] = img results['gt_joints'] = joints_3d results['joints_vis'] = joints_3d_visible results['center'] = center return results @register_keypointop class TopDownRandomShiftBboxCenter(object): """Random shift the bbox center. Args: shift_factor (float): The factor to control the shift range, which is scale*pixel_std*scale_factor. Default: 0.16 shift_prob (float): Probability of applying random shift. Default: 0.3 """ def __init__(self, shift_factor=0.16, shift_prob=0.3): self.shift_factor = shift_factor self.shift_prob = shift_prob def __call__(self, results): center = results['center'] scale = results['scale'] if np.random.rand() < self.shift_prob: center += np.random.uniform( -1, 1, 2) * self.shift_factor * scale * 200.0 results['center'] = center return results @register_keypointop class TopDownGetRandomScaleRotation(object): """Data augmentation with random scaling & rotating. Args: rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. rot_prob (float): Probability of random rotation. """ def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): self.rot_factor = rot_factor self.scale_factor = scale_factor self.rot_prob = rot_prob def __call__(self, results): """Perform data augmentation with random scaling & rotating.""" s = results['scale'] sf = self.scale_factor rf = self.rot_factor s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) s = s * s_factor r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) r = r_factor if np.random.rand() <= self.rot_prob else 0 results['scale'] = s results['rotate'] = r return results @register_keypointop class TopDownAffine(object): """apply affine transform to image and coords Args: trainsize (list): [w, h], the standard size used to train use_udp (bool): whether to use Unbiased Data Processing. 
    records(dict): the dict containing the image and coords
    Returns:
        records(dict): contain the image and coords after being transformed
    """

    def __init__(self, trainsize, use_udp=False):
        self.trainsize = trainsize
        self.use_udp = use_udp

    def __call__(self, records):
        image = records['image']
        joints = records['gt_joints']
        joints_vis = records['joints_vis']
        rot = records['rotate'] if "rotate" in records else 0
        if self.use_udp:
            trans = get_warp_matrix(
                rot, records['center'] * 2.0,
                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
                records['scale'] * 200.0)
            image = cv2.warpAffine(
                image,
                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR)
            joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans)
        else:
            trans = get_affine_transform(records['center'],
                                         records['scale'] * 200, rot,
                                         self.trainsize)
            image = cv2.warpAffine(
                image,
                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR)
            # only visible joints are warped; occluded ones keep their values
            for i in range(joints.shape[0]):
                if joints_vis[i, 0] > 0.0:
                    joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
        records['image'] = image
        records['gt_joints'] = joints
        return records


@register_keypointop
class SinglePoseAffine(object):
    """apply affine transform to image and coords

    Args:
        trainsize (list): [w, h], the standard size used to train
        rotate (list): [prob, range], probability and degree range of the random rotation
        scale (list): [prob, ratio], probability and ratio range of the random scaling
        use_udp (bool): whether to use Unbiased Data Processing.
        records(dict): the dict containing the image and coords

    Returns:
        records(dict): contain the image and coords after being transformed
    """

    def __init__(self,
                 trainsize,
                 rotate=[1.0, 30],
                 scale=[1.0, 0.25],
                 use_udp=False):
        self.trainsize = trainsize
        self.use_udp = use_udp
        self.rot_prob = rotate[0]
        self.rot_range = rotate[1]
        self.scale_prob = scale[0]
        self.scale_ratio = scale[1]

    def __call__(self, records):
        image = records['image']
        if 'joints_2d' in records:
            # the redundant "if 'joints_2d' in records" ternary from the
            # original is dropped: this branch already guarantees the key
            joints = records['joints_2d']
            joints_vis = records['joints_vis'] if 'joints_vis' in records \
                else np.ones((len(joints), 1))
        rot = 0
        s = 1.
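        # --- Added illustrative note (not part of the original method): the
        # two branches below sample rotation and scale from clipped normal
        # distributions, each applied with its own probability. A standalone
        # sketch, assuming the defaults rot_range=30 and scale_ratio=0.25
        # from __init__ above:
        #   >>> import numpy as np
        #   >>> rot_range, scale_ratio = 30, 0.25
        #   >>> r = float(np.clip(np.random.randn() * rot_range,
        #   ...                   -rot_range * 2, rot_range * 2))
        #   >>> s = float(np.clip(np.random.randn() * scale_ratio + 1,
        #   ...                   1 - scale_ratio, 1 + scale_ratio))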
if np.random.random() < self.rot_prob: rot = np.clip(np.random.randn() * self.rot_range, -self.rot_range * 2, self.rot_range * 2) if np.random.random() < self.scale_prob: s = np.clip(np.random.randn() * self.scale_ratio + 1, 1 - self.scale_ratio, 1 + self.scale_ratio) if self.use_udp: trans = get_warp_matrix( rot, np.array(records['bbox_center']) * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], records['bbox_scale'] * 200.0 * s) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) if 'joints_2d' in records: joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans) else: trans = get_affine_transform( np.array(records['bbox_center']), records['bbox_scale'] * s * 200, rot, self.trainsize) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) if 'joints_2d' in records: for i in range(len(joints)): if joints_vis[i, 0] > 0.0: joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) if 'joints_3d' in records: pose3d = records['joints_3d'] if not rot == 0: trans_3djoints = np.eye(3) rot_rad = -rot * np.pi / 180 sn, cs = np.sin(rot_rad), np.cos(rot_rad) trans_3djoints[0, :2] = [cs, -sn] trans_3djoints[1, :2] = [sn, cs] pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints, pose3d[:, :3]) records['joints_3d'] = pose3d records['image'] = image if 'joints_2d' in records: records['joints_2d'] = joints return records @register_keypointop class NoiseJitter(object): """apply NoiseJitter to image Args: noise_factor (float): the noise factor ratio used to generate the jitter Returns: records (dict): contain the image and coords after tranformed """ def __init__(self, noise_factor=0.4): self.noise_factor = noise_factor def __call__(self, records): self.pn = np.random.uniform(1 - self.noise_factor, 1 + self.noise_factor, 3) rgb_img = records['image'] rgb_img[:, :, 0] = np.minimum( 255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0])) rgb_img[:, :, 1] = np.minimum( 255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1])) rgb_img[:, :, 2] = np.minimum( 255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2])) records['image'] = rgb_img return records @register_keypointop class FlipPose(object): """random apply flip to image Args: noise_factor (float): the noise factor ratio used to generate the jitter Returns: records (dict): contain the image and coords after tranformed """ def __init__(self, flip_prob=0.5, img_res=224, num_joints=14): self.flip_pob = flip_prob self.img_res = img_res if num_joints == 24: self.perm = [ 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18, 19, 21, 20, 23, 22 ] elif num_joints == 14: self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13] else: print("error num_joints in flip :{}".format(num_joints)) def __call__(self, records): if np.random.random() < self.flip_pob: img = records['image'] img = np.fliplr(img) if 'joints_2d' in records: joints_2d = records['joints_2d'] joints_2d = joints_2d[self.perm] joints_2d[:, 0] = self.img_res - joints_2d[:, 0] records['joints_2d'] = joints_2d if 'joints_3d' in records: joints_3d = records['joints_3d'] joints_3d = joints_3d[self.perm] joints_3d[:, 0] = -joints_3d[:, 0] records['joints_3d'] = joints_3d records['image'] = img return records @register_keypointop class TopDownEvalAffine(object): """apply affine transform to image and coords Args: trainsize (list): [w, h], the standard size used to train use_udp (bool): whether to use Unbiased Data Processing. 
records(dict): the dict contained the image and coords Returns: records (dict): contain the image and coords after tranformed """ def __init__(self, trainsize, use_udp=False): self.trainsize = trainsize self.use_udp = use_udp def __call__(self, records): image = records['image'] rot = 0 imshape = records['im_shape'][::-1] center = imshape / 2. scale = imshape if self.use_udp: trans = get_warp_matrix( rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) else: trans = get_affine_transform(center, scale, rot, self.trainsize) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) records['image'] = image return records @register_keypointop class ToHeatmapsTopDown(object): """to generate the gaussin heatmaps of keypoint for heatmap loss Args: hmsize (list): [w, h] output heatmap's size sigma (float): the std of gaussin kernel genereted records(dict): the dict contained the image and coords Returns: records (dict): contain the heatmaps used to heatmaploss """ def __init__(self, hmsize, sigma): super(ToHeatmapsTopDown, self).__init__() self.hmsize = np.array(hmsize) self.sigma = sigma def __call__(self, records): """refer to https://github.com/leoxiaobin/deep-high-resolution-net.pytorch Copyright (c) Microsoft, under the MIT License. """ joints = records['gt_joints'] joints_vis = records['joints_vis'] num_joints = joints.shape[0] image_size = np.array( [records['image'].shape[1], records['image'].shape[0]]) target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis[:, 0] target = np.zeros( (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) tmp_size = self.sigma * 3 feat_stride = image_size / self.hmsize for joint_id in range(num_joints): mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ 0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue # # Generate gaussian size = 2 * tmp_size + 1 x = np.arange(0, size, 1, np.float32) y = x[:, np.newaxis] x0 = y0 = size // 2 # The gaussian is not normalized, we want the center value to equal 1 g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) # Usable gaussian range g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] # Image range img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) v = target_weight[joint_id] if v > 0.5: target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ 0]:g_y[1], g_x[0]:g_x[1]] records['target'] = target records['target_weight'] = target_weight del records['gt_joints'], records['joints_vis'] return records @register_keypointop class ToHeatmapsTopDown_DARK(object): """to generate the gaussin heatmaps of keypoint for heatmap loss Args: hmsize (list): [w, h] output heatmap's size sigma (float): the std of gaussin kernel genereted records(dict): the dict contained the image and coords Returns: records (dict): contain the heatmaps used to heatmaploss """ def __init__(self, hmsize, sigma): super(ToHeatmapsTopDown_DARK, self).__init__() self.hmsize = 
np.array(hmsize) self.sigma = sigma def __call__(self, records): joints = records['gt_joints'] joints_vis = records['joints_vis'] num_joints = joints.shape[0] image_size = np.array( [records['image'].shape[1], records['image'].shape[0]]) target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis[:, 0] target = np.zeros( (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) tmp_size = self.sigma * 3 feat_stride = image_size / self.hmsize for joint_id in range(num_joints): mu_x = joints[joint_id][0] / feat_stride[0] mu_y = joints[joint_id][1] / feat_stride[1] # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ 0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue x = np.arange(0, self.hmsize[0], 1, np.float32) y = np.arange(0, self.hmsize[1], 1, np.float32) y = y[:, np.newaxis] v = target_weight[joint_id] if v > 0.5: target[joint_id] = np.exp(-( (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2)) records['target'] = target records['target_weight'] = target_weight del records['gt_joints'], records['joints_vis'] return records @register_keypointop class ToHeatmapsTopDown_UDP(object): """This code is based on: https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py to generate the gaussian heatmaps of keypoint for heatmap loss. ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). Args: hmsize (list): [w, h] output heatmap's size sigma (float): the std of gaussin kernel genereted records(dict): the dict contained the image and coords Returns: records (dict): contain the heatmaps used to heatmaploss """ def __init__(self, hmsize, sigma): super(ToHeatmapsTopDown_UDP, self).__init__() self.hmsize = np.array(hmsize) self.sigma = sigma def __call__(self, records): joints = records['gt_joints'] joints_vis = records['joints_vis'] num_joints = joints.shape[0] image_size = np.array( [records['image'].shape[1], records['image'].shape[0]]) target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis[:, 0] target = np.zeros( (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) tmp_size = self.sigma * 3 size = 2 * tmp_size + 1 x = np.arange(0, size, 1, np.float32) y = x[:, None] feat_stride = (image_size - 1.0) / (self.hmsize - 1.0) for joint_id in range(num_joints): mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ 0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue mu_x_ac = joints[joint_id][0] / feat_stride[0] mu_y_ac = joints[joint_id][1] / feat_stride[1] x0 = y0 = size // 2 x0 += mu_x_ac - mu_x y0 += mu_y_ac - mu_y g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) # Usable gaussian range g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] # Image range img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) v = target_weight[joint_id] if v > 0.5: 
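                # --- Added illustrative note (not part of the original
                # method): unlike the integer-centered kernel in
                # ToHeatmapsTopDown, the UDP variant above offsets the kernel
                # center x0/y0 by the sub-pixel residual (mu_x_ac - mu_x,
                # mu_y_ac - mu_y), so the heatmap peak encodes the continuous
                # keypoint location rather than the rounded one. Sketch:
                #   >>> mu_x_ac = 7.3              # continuous grid coordinate
                #   >>> mu_x = int(mu_x_ac + 0.5)  # rounded -> 7
                #   >>> round(mu_x_ac - mu_x, 3)   # residual folded into x0
                #   0.3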
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ 0]:g_y[1], g_x[0]:g_x[1]] records['target'] = target records['target_weight'] = target_weight del records['gt_joints'], records['joints_vis'] return records from typing import Optional, Tuple, Union, List import numbers def _scale_size( size: Tuple[int, int], scale: Union[float, int, tuple], ) -> Tuple[int, int]: """Rescale a size by a ratio. Args: size (tuple[int]): (w, h). scale (float | tuple(float)): Scaling factor. Returns: tuple[int]: scaled size. """ if isinstance(scale, (float, int)): scale = (scale, scale) w, h = size return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) def rescale_size(old_size: tuple, scale: Union[float, int, tuple], return_scale: bool=False) -> tuple: """Calculate the new size to be rescaled to. Args: old_size (tuple[int]): The old size (w, h) of image. scale (float | tuple[int]): The scaling factor or maximum size. If it is a float number, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image size. Returns: tuple[int]: The new rescaled image size. """ w, h = old_size if isinstance(scale, (float, int)): if scale <= 0: raise ValueError(f'Invalid scale {scale}, must be positive.') scale_factor = scale elif isinstance(scale, list): max_long_edge = max(scale) max_short_edge = min(scale) scale_factor = min(max_long_edge / max(h, w), max_short_edge / min(h, w)) else: raise TypeError( f'Scale must be a number or tuple of int, but got {type(scale)}') new_size = _scale_size((w, h), scale_factor) if return_scale: return new_size, scale_factor else: return new_size def imrescale(img: np.ndarray, scale: Union[float, Tuple[int, int]], return_scale: bool=False, interpolation: str='bilinear', backend: Optional[str]=None) -> Union[np.ndarray, Tuple[ np.ndarray, float]]: """Resize image while keeping the aspect ratio. Args: img (ndarray): The input image. scale (float | tuple[int]): The scaling factor or maximum size. If it is a float number, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image. interpolation (str): Same as :func:`resize`. backend (str | None): Same as :func:`resize`. Returns: ndarray: The rescaled image. """ h, w = img.shape[:2] new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) rescaled_img = imresize( img, new_size, interpolation=interpolation, backend=backend) if return_scale: return rescaled_img, scale_factor else: return rescaled_img def imresize( img: np.ndarray, size: Tuple[int, int], return_scale: bool=False, interpolation: str='bilinear', out: Optional[np.ndarray]=None, backend: Optional[str]=None, interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: """Resize image to a given size. Args: img (ndarray): The input image. size (tuple[int]): Target size (w, h). return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. out (ndarray): The output destination. backend (str | None): The image resize backend type. Options are `cv2`, `pillow`, `None`. 
If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = img.shape[:2] if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported for resize.' f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' pil_image = Image.fromarray(img) pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) resized_img = np.array(pil_image) else: resized_img = cv2.resize(img, size, dst=out, interpolation=interp) if not return_scale: return resized_img else: w_scale = size[0] / w h_scale = size[1] / h return resized_img, w_scale, h_scale class PETR_Resize: """Resize images & bbox & mask. This transform resizes the input image to some scale. Bboxes and masks are then resized with the same scale factor. If the input dict contains the key "scale", then the scale in the input dict is used, otherwise the specified scale in the init method is used. If the input dict contains the key "scale_factor" (if MultiScaleFlipAug does not give img_scale but scale_factor), the actual scale will be computed by image shape and scale_factor. `img_scale` can either be a tuple (single-scale) or a list of tuple (multi-scale). There are 3 multiscale modes: - ``ratio_range is not None``: randomly sample a ratio from the ratio \ range and multiply it with the image scale. - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ sample a scale from the multiscale range. - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ sample a scale from multiple scales. Args: img_scale (tuple or list[tuple]): Images scales for resizing. multiscale_mode (str): Either "range" or "value". ratio_range (tuple[float]): (min_ratio, max_ratio) keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. bbox_clip_border (bool, optional): Whether to clip the objects outside the border of the image. In some dataset like MOT17, the gt bboxes are allowed to cross the border of images. Therefore, we don't need to clip the gt bboxes in these cases. Defaults to True. backend (str): Image resize backend, choices are 'cv2' and 'pillow'. These two backends generates slightly different results. Defaults to 'cv2'. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. override (bool, optional): Whether to override `scale` and `scale_factor` so as to call resize twice. Default False. If True, after the first resizing, the existed `scale` and `scale_factor` will be ignored so the second resizing can be allowed. This option is a work-around for multiple times of resize in DETR. Defaults to False. 
""" def __init__(self, img_scale=None, multiscale_mode='range', ratio_range=None, keep_ratio=True, bbox_clip_border=True, backend='cv2', interpolation='bilinear', override=False, keypoint_clip_border=True): if img_scale is None: self.img_scale = None else: if isinstance(img_scale, list): self.img_scale = img_scale else: self.img_scale = [img_scale] assert isinstance(self.img_scale, list) if ratio_range is not None: # mode 1: given a scale and a range of image ratio assert len(self.img_scale) == 1 else: # mode 2: given multiple scales or a range of scales assert multiscale_mode in ['value', 'range'] self.backend = backend self.multiscale_mode = multiscale_mode self.ratio_range = ratio_range self.keep_ratio = keep_ratio # TODO: refactor the override option in Resize self.interpolation = interpolation self.override = override self.bbox_clip_border = bbox_clip_border self.keypoint_clip_border = keypoint_clip_border @staticmethod def random_select(img_scales): """Randomly select an img_scale from given candidates. Args: img_scales (list[tuple]): Images scales for selection. Returns: (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ where ``img_scale`` is the selected image scale and \ ``scale_idx`` is the selected index in the given candidates. """ assert isinstance(img_scales, list) scale_idx = np.random.randint(len(img_scales)) img_scale = img_scales[scale_idx] return img_scale, scale_idx @staticmethod def random_sample(img_scales): """Randomly sample an img_scale when ``multiscale_mode=='range'``. Args: img_scales (list[tuple]): Images scale range for sampling. There must be two tuples in img_scales, which specify the lower and upper bound of image scales. Returns: (tuple, None): Returns a tuple ``(img_scale, None)``, where \ ``img_scale`` is sampled scale and None is just a placeholder \ to be consistent with :func:`random_select`. """ assert isinstance(img_scales, list) and len(img_scales) == 2 img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale, None @staticmethod def random_sample_ratio(img_scale, ratio_range): """Randomly sample an img_scale when ``ratio_range`` is specified. A ratio will be randomly sampled from the range specified by ``ratio_range``. Then it would be multiplied with ``img_scale`` to generate sampled scale. Args: img_scale (list): Images scale base to multiply with ratio. ratio_range (tuple[float]): The minimum and maximum ratio to scale the ``img_scale``. Returns: (tuple, None): Returns a tuple ``(scale, None)``, where \ ``scale`` is sampled ratio multiplied with ``img_scale`` and \ None is just a placeholder to be consistent with \ :func:`random_select`. """ assert isinstance(img_scale, list) and len(img_scale) == 2 min_ratio, max_ratio = ratio_range assert min_ratio <= max_ratio ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) return scale, None def _random_scale(self, results): """Randomly sample an img_scale according to ``ratio_range`` and ``multiscale_mode``. If ``ratio_range`` is specified, a ratio will be sampled and be multiplied with ``img_scale``. If multiple scales are specified by ``img_scale``, a scale will be sampled according to ``multiscale_mode``. Otherwise, single scale will be used. 
Args: results (dict): Result dict from :obj:`dataset`. Returns: dict: Two new keys 'scale` and 'scale_idx` are added into \ ``results``, which would be used by subsequent pipelines. """ if self.ratio_range is not None: scale, scale_idx = self.random_sample_ratio(self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: scale, scale_idx = self.img_scale[0], 0 elif self.multiscale_mode == 'range': scale, scale_idx = self.random_sample(self.img_scale) elif self.multiscale_mode == 'value': scale, scale_idx = self.random_select(self.img_scale) else: raise NotImplementedError results['scale'] = scale results['scale_idx'] = scale_idx def _resize_img(self, results): """Resize images with ``results['scale']``.""" for key in ['image'] if 'image' in results else []: if self.keep_ratio: img, scale_factor = imrescale( results[key], results['scale'], return_scale=True, interpolation=self.interpolation, backend=self.backend) # the w_scale and h_scale has minor difference # a real fix should be done in the imrescale in the future new_h, new_w = img.shape[:2] h, w = results[key].shape[:2] w_scale = new_w / w h_scale = new_h / h else: img, w_scale, h_scale = imresize( results[key], results['scale'], return_scale=True, interpolation=self.interpolation, backend=self.backend) scale_factor = np.array( [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) results['im_shape'] = np.array(img.shape) # in case that there is no padding results['pad_shape'] = img.shape results['scale_factor'] = scale_factor results['keep_ratio'] = self.keep_ratio # img_pad = self.impad(img, shape=results['scale']) results[key] = img def _resize_bboxes(self, results): """Resize bounding boxes with ``results['scale_factor']``.""" for key in ['gt_bbox'] if 'gt_bbox' in results else []: bboxes = results[key] * results['scale_factor'] if self.bbox_clip_border: img_shape = results['im_shape'] bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) results[key] = bboxes def _resize_masks(self, results): """Resize masks with ``results['scale']``""" for key in ['mask'] if 'mask' in results else []: if results[key] is None: continue if self.keep_ratio: results[key] = results[key].rescale(results['scale']) else: results[key] = results[key].resize(results['im_shape'][:2]) def _resize_seg(self, results): """Resize semantic segmentation map with ``results['scale']``.""" for key in ['seg'] if 'seg' in results else []: if self.keep_ratio: gt_seg = imrescale( results[key], results['scale'], interpolation='nearest', backend=self.backend) else: gt_seg = imresize( results[key], results['scale'], interpolation='nearest', backend=self.backend) results[key] = gt_seg def _resize_keypoints(self, results): """Resize keypoints with ``results['scale_factor']``.""" for key in ['gt_joints'] if 'gt_joints' in results else []: keypoints = results[key].copy() keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0] keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1] if self.keypoint_clip_border: img_shape = results['im_shape'] keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1]) keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0]) results[key] = keypoints def _resize_areas(self, results): """Resize mask areas with ``results['scale_factor']``.""" for key in ['gt_areas'] if 'gt_areas' in results else []: areas = results[key].copy() areas = areas * results['scale_factor'][0] * results[ 'scale_factor'][1] results[key] = areas def __call__(self, 
results):
        """Call function to resize images, bounding boxes, masks, semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results; 'im_shape', 'pad_shape', 'scale_factor'
                and 'keep_ratio' keys are added into the result dict.
        """
        if 'scale' not in results:
            if 'scale_factor' in results:
                img_shape = results['image'].shape[:2]
                scale_factor = results['scale_factor'][0]
                # assert isinstance(scale_factor, float)
                results['scale'] = [int(x * scale_factor)
                                    for x in img_shape][::-1]
            else:
                self._random_scale(results)
        else:
            if not self.override:
                assert 'scale_factor' not in results, (
                    "'scale' and 'scale_factor' cannot both be set.")
            else:
                results.pop('scale')
                if 'scale_factor' in results:
                    results.pop('scale_factor')
                self._random_scale(results)

        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_masks(results)
        self._resize_seg(results)
        self._resize_keypoints(results)
        self._resize_areas(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'multiscale_mode={self.multiscale_mode}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        # keep the fields comma-separated and close the parenthesis only once
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'keypoint_clip_border={self.keypoint_clip_border})'
        return repr_str


================================================
FILE: ppdet/data/transform/keypoints_3d_operators.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence

import cv2
import numpy as np
import math
import copy
import random
import uuid
from numbers import Number, Integral

from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

registered_ops = []

__all__ = [
    'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'
]

import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from mpl_toolkits.mplot3d import Axes3D


def register_keypointop(cls):
    return serializable(cls)


def register_op(cls):
    registered_ops.append(cls.__name__)
    if not hasattr(BaseOperator, cls.__name__):
        setattr(BaseOperator, cls.__name__, cls)
    else:
        raise KeyError("The {} class has been registered.".format(
            cls.__name__))
    return serializable(cls)


class BaseOperator(object):
    def __init__(self, name=None):
        if name is None:
            name = self.__class__.__name__
        self._id = name + '_' + str(uuid.uuid4())[-6:]

    def apply(self, sample, context=None):
        """ Process a sample.
Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ return sample def __call__(self, sample, context=None): """ Process a sample. Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ if isinstance(sample, Sequence): # for batch_size for i in range(len(sample)): sample[i] = self.apply(sample[i], context) else: # image.shape changed sample = self.apply(sample, context) return sample def __str__(self): return str(self._id) @register_keypointop class CropAndFlipImages(object): """Crop all images""" def __init__(self, crop_range, flip_pairs=None): super(CropAndFlipImages, self).__init__() self.crop_range = crop_range self.flip_pairs = flip_pairs def __call__(self, records): # tuple images = records["image"] images = images[:, :, ::-1, :] images = images[:, :, self.crop_range[0]:self.crop_range[1]] records["image"] = images if "kps2d" in records.keys(): kps2d = records["kps2d"] width, height = images.shape[2], images.shape[1] kps2d = np.array(kps2d) kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0] for pair in self.flip_pairs: kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() records["kps2d"] = kps2d return records @register_op class PermuteImages(BaseOperator): def __init__(self): """ Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920) """ super(PermuteImages, self).__init__() def apply(self, sample, context=None): images = sample["image"] images = images.transpose((0, 3, 1, 2)) sample["image"] = images return sample @register_keypointop class RandomFlipHalfBody3DTransformImages(object): """apply data augment to images and coords to achieve the flip, scale, rotate and half body transform effect for training image Args: trainsize (list):[w, h], Image target size upper_body_ids (list): The upper body joint ids flip_pairs (list): The left-right joints exchange order list pixel_std (int): The pixel std of the scale scale (float): The scale factor to transform the image rot (int): The rotate factor to transform the image num_joints_half_body (int): The joints threshold of the half body transform prob_half_body (float): The threshold of the half body transform flip (bool): Whether to flip the image Returns: records(dict): contain the image and coords after tranformed """ def __init__(self, trainsize, upper_body_ids, flip_pairs, pixel_std, scale=0.35, rot=40, num_joints_half_body=8, prob_half_body=0.3, flip=True, rot_prob=0.6, do_occlusion=False): super(RandomFlipHalfBody3DTransformImages, self).__init__() self.trainsize = trainsize self.upper_body_ids = upper_body_ids self.flip_pairs = flip_pairs self.pixel_std = pixel_std self.scale = scale self.rot = rot self.num_joints_half_body = num_joints_half_body self.prob_half_body = prob_half_body self.flip = flip self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] self.rot_prob = rot_prob self.do_occlusion = do_occlusion def halfbody_transform(self, joints, joints_vis): upper_joints = [] lower_joints = [] for joint_id in range(joints.shape[0]): if joints_vis[joint_id][0] > 0: if joint_id in self.upper_body_ids: upper_joints.append(joints[joint_id]) else: lower_joints.append(joints[joint_id]) if np.random.randn() < 0.5 and len(upper_joints) > 2: selected_joints = upper_joints else: selected_joints = lower_joints if len( lower_joints) > 2 else upper_joints if 
len(selected_joints) < 2: return None, None selected_joints = np.array(selected_joints, dtype=np.float32) center = selected_joints.mean(axis=0)[:2] left_top = np.amin(selected_joints, axis=0) right_bottom = np.amax(selected_joints, axis=0) w = right_bottom[0] - left_top[0] h = right_bottom[1] - left_top[1] if w > self.aspect_ratio * h: h = w * 1.0 / self.aspect_ratio elif w < self.aspect_ratio * h: w = h * self.aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) scale = scale * 1.5 return center, scale def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None): # joints: (6, 24, 3),(num_frames, num_joints, 3) joints[:, :, 0] = width - joints[:, :, 0] - 1 # x if kps2d is not None: kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1 for pair in matched_parts: joints[:, pair[0], :], joints[:,pair[1], :] = \ joints[:,pair[1], :], joints[:,pair[0], :].copy() joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \ joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy() if kps2d is not None: kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() # move to zero joints -= joints[:, [0], :] # (batch_size, 24, 3),numpy.ndarray return joints, joints_vis, kps2d def __call__(self, records): images = records[ 'image'] #kps3d, kps3d_vis, images. images.shape(num_frames, width, height, 3) joints = records['kps3d'] joints_vis = records['kps3d_vis'] kps2d = None if 'kps2d' in records.keys(): kps2d = records['kps2d'] if self.flip and np.random.random() <= 0.5: images = images[:, :, ::-1, :] # 图像水平翻转 (6, 1080, 810, 3) joints, joints_vis, kps2d = self.flip_joints( joints, joints_vis, images.shape[2], self.flip_pairs, kps2d) # 关键点左右对称翻转 occlusion = False if self.do_occlusion and random.random() <= 0.5: # 随机遮挡 height = images[0].shape[0] width = images[0].shape[1] occlusion = True while True: area_min = 0.0 area_max = 0.2 synth_area = (random.random() * (area_max - area_min) + area_min) * width * height ratio_min = 0.3 ratio_max = 1 / 0.3 synth_ratio = (random.random() * (ratio_max - ratio_min) + ratio_min) synth_h = math.sqrt(synth_area * synth_ratio) synth_w = math.sqrt(synth_area / synth_ratio) synth_xmin = random.random() * (width - synth_w - 1) synth_ymin = random.random() * (height - synth_h - 1) if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height: xmin = int(synth_xmin) ymin = int(synth_ymin) w = int(synth_w) h = int(synth_h) mask = np.random.rand(h, w, 3) * 255 images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[ None, :, :, :] break records['image'] = images records['kps3d'] = joints records['kps3d_vis'] = joints_vis if kps2d is not None: records['kps2d'] = kps2d return records ================================================ FILE: ppdet/data/transform/mot_operators.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Integral import cv2 import copy import numpy as np import random import math from .operators import BaseOperator, register_op from .batch_operators import Gt2TTFTarget from ppdet.modeling.bbox_utils import bbox_iou_np_expand from ppdet.utils.logger import setup_logger from .op_helper import gaussian_radius logger = setup_logger(__name__) __all__ = [ 'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres', 'Gt2JDETargetMax', 'Gt2FairMOTTarget' ] @register_op class RGBReverse(BaseOperator): """RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine """ def __init__(self): super(RGBReverse, self).__init__() def apply(self, sample, context=None): im = sample['image'] sample['image'] = np.ascontiguousarray(im[:, :, ::-1]) return sample @register_op class LetterBoxResize(BaseOperator): def __init__(self, target_size): """ Resize image to target size, convert normalized xywh to pixel xyxy format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). Args: target_size (int|list): image target size. """ super(LetterBoxResize, self).__init__() if not isinstance(target_size, (Integral, Sequence)): raise TypeError( "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". format(type(target_size))) if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)): # letterbox: resize a rectangular image to a padded rectangular shape = img.shape[:2] # [height, width] ratio_h = float(height) / shape[0] ratio_w = float(width) / shape[1] ratio = min(ratio_h, ratio_w) new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # [width, height] padw = (width - new_shape[0]) / 2 padh = (height - new_shape[1]) / 2 top, bottom = round(padh - 0.1), round(padh + 0.1) left, right = round(padw - 0.1), round(padw + 0.1) img = cv2.resize( img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border img = cv2.copyMakeBorder( img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular return img, ratio, padw, padh def apply_bbox(self, bbox0, h, w, ratio, padw, padh): bboxes = bbox0.copy() bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh return bboxes def apply(self, sample, context=None): """ Resize the image numpy. 
""" im = sample['image'] h, w = sample['im_shape'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: from PIL import UnidentifiedImageError raise UnidentifiedImageError( '{}: image is not 3-dimensional.'.format(self)) # apply image height, width = self.target_size img, ratio, padw, padh = self.apply_image( im, height=height, width=width) sample['image'] = img new_shape = (round(h * ratio), round(w * ratio)) sample['im_shape'] = np.asarray(new_shape, dtype=np.float32) sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio, padw, padh) return sample @register_op class MOTRandomAffine(BaseOperator): """ Affine transform to image and coords to achieve the rotate, scale and shift effect for training image. Args: degrees (list[2]): the rotate range to apply, transform range is [min, max] translate (list[2]): the translate range to apply, transform range is [min, max] scale (list[2]): the scale range to apply, transform range is [min, max] shear (list[2]): the shear range to apply, transform range is [min, max] borderValue (list[3]): value used in case of a constant border when appling the perspective transformation reject_outside (bool): reject warped bounding bboxes outside of image Returns: records(dict): contain the image and coords after tranformed """ def __init__(self, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.50, 1.20), shear=(-2, 2), borderValue=(127.5, 127.5, 127.5), reject_outside=True): super(MOTRandomAffine, self).__init__() self.degrees = degrees self.translate = translate self.scale = scale self.shear = shear self.borderValue = borderValue self.reject_outside = reject_outside def apply(self, sample, context=None): # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 border = 0 # width of added border (optional) img = sample['image'] height, width = img.shape[0], img.shape[1] # Rotation and Scale R = np.eye(3) a = random.random() * (self.degrees[1] - self.degrees[0] ) + self.degrees[0] s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0] R[:2] = cv2.getRotationMatrix2D( angle=a, center=(width / 2, height / 2), scale=s) # Translation T = np.eye(3) T[0, 2] = ( random.random() * 2 - 1 ) * self.translate[0] * height + border # x translation (pixels) T[1, 2] = ( random.random() * 2 - 1 ) * self.translate[1] * width + border # y translation (pixels) # Shear S = np.eye(3) S[0, 1] = math.tan((random.random() * (self.shear[1] - self.shear[0]) + self.shear[0]) * math.pi / 180) # x shear (deg) S[1, 0] = math.tan((random.random() * (self.shear[1] - self.shear[0]) + self.shear[0]) * math.pi / 180) # y shear (deg) M = S @T @R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
imw = cv2.warpPerspective( img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=self.borderValue) # BGR order borderValue if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: targets = sample['gt_bbox'] n = targets.shape[0] points = targets.copy() area0 = (points[:, 2] - points[:, 0]) * ( points[:, 3] - points[:, 1]) # warp points xy = np.ones((n * 4, 3)) xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( n * 4, 2) # x1y1, x2y2, x1y2, x2y1 xy = (xy @M.T)[:, :2].reshape(n, 8) # create new boxes x = xy[:, [0, 2, 4, 6]] y = xy[:, [1, 3, 5, 7]] xy = np.concatenate( (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T # apply angle-based reduction radians = a * math.pi / 180 reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5 x = (xy[:, 2] + xy[:, 0]) / 2 y = (xy[:, 3] + xy[:, 1]) / 2 w = (xy[:, 2] - xy[:, 0]) * reduction h = (xy[:, 3] - xy[:, 1]) * reduction xy = np.concatenate( (x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T # reject warped points outside of image if self.reject_outside: np.clip(xy[:, 0], 0, width, out=xy[:, 0]) np.clip(xy[:, 2], 0, width, out=xy[:, 2]) np.clip(xy[:, 1], 0, height, out=xy[:, 1]) np.clip(xy[:, 3], 0, height, out=xy[:, 3]) w = xy[:, 2] - xy[:, 0] h = xy[:, 3] - xy[:, 1] area = w * h ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) if sum(i) > 0: sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype) sample['gt_class'] = sample['gt_class'][i] if 'difficult' in sample: sample['difficult'] = sample['difficult'][i] if 'gt_ide' in sample: sample['gt_ide'] = sample['gt_ide'][i] if 'is_crowd' in sample: sample['is_crowd'] = sample['is_crowd'][i] sample['image'] = imw return sample else: return sample @register_op class Gt2JDETargetThres(BaseOperator): __shared__ = ['num_classes'] """ Generate JDE targets by groud truth data when training Args: anchors (list): anchors of JDE model anchor_masks (list): anchor_masks of JDE model downsample_ratios (list): downsample ratios of JDE model ide_thresh (float): thresh of identity, higher is groud truth fg_thresh (float): thresh of foreground, higher is foreground bg_thresh (float): thresh of background, lower is background num_classes (int): number of classes """ def __init__(self, anchors, anchor_masks, downsample_ratios, ide_thresh=0.5, fg_thresh=0.5, bg_thresh=0.4, num_classes=1): super(Gt2JDETargetThres, self).__init__() self.anchors = anchors self.anchor_masks = anchor_masks self.downsample_ratios = downsample_ratios self.ide_thresh = ide_thresh self.fg_thresh = fg_thresh self.bg_thresh = bg_thresh self.num_classes = num_classes def generate_anchor(self, nGh, nGw, anchor_hw): nA = len(anchor_hw) yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw)) mesh = np.stack([xx.T, yy.T], axis=0) # [2, nGh, nGw] mesh = np.repeat(mesh[None, :], nA, axis=0) # [nA, 2, nGh, nGw] anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None] anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2) anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1) anchor_mesh = np.concatenate( [mesh, anchor_offset_mesh], axis=1) # [nA, 4, nGh, nGw] return anchor_mesh def encode_delta(self, gt_box_list, fg_anchor_list): px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ fg_anchor_list[:, 2], fg_anchor_list[:,3] gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ gt_box_list[:, 2], gt_box_list[:, 3] dx = (gx - px) / pw dy = (gy - py) / ph dw = np.log(gw / pw) dh = np.log(gh / ph) return 
np.stack([dx, dy, dw, dh], axis=1) def pad_box(self, sample, num_max): assert 'gt_bbox' in sample bbox = sample['gt_bbox'] gt_num = len(bbox) pad_bbox = np.zeros((num_max, 4), dtype=np.float32) if gt_num > 0: pad_bbox[:gt_num, :] = bbox[:gt_num, :] sample['gt_bbox'] = pad_bbox if 'gt_score' in sample: pad_score = np.zeros((num_max, ), dtype=np.float32) if gt_num > 0: pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] sample['gt_score'] = pad_score if 'difficult' in sample: pad_diff = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] sample['difficult'] = pad_diff if 'is_crowd' in sample: pad_crowd = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] sample['is_crowd'] = pad_crowd if 'gt_ide' in sample: pad_ide = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] sample['gt_ide'] = pad_ide return sample def __call__(self, samples, context=None): assert len(self.anchor_masks) == len(self.downsample_ratios), \ "anchor_masks', and 'downsample_ratios' should have same length." h, w = samples[0]['image'].shape[1:3] num_max = 0 for sample in samples: num_max = max(num_max, len(sample['gt_bbox'])) for sample in samples: gt_bbox = sample['gt_bbox'] gt_ide = sample['gt_ide'] for i, (anchor_hw, downsample_ratio ) in enumerate(zip(self.anchors, self.downsample_ratios)): anchor_hw = np.array( anchor_hw, dtype=np.float32) / downsample_ratio nA = len(anchor_hw) nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio) tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32) tconf = np.zeros((nA, nGh, nGw), dtype=np.float32) tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32) gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy() gxy[:, 0] = gxy[:, 0] * nGw gxy[:, 1] = gxy[:, 1] * nGh gwh[:, 0] = gwh[:, 0] * nGw gwh[:, 1] = gwh[:, 1] * nGh gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1) gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1) tboxes = np.concatenate([gxy, gwh], axis=1) anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw) anchor_list = np.transpose(anchor_mesh, (0, 2, 3, 1)).reshape(-1, 4) iou_pdist = bbox_iou_np_expand( anchor_list, tboxes, x1y1x2y2=False) iou_max = np.max(iou_pdist, axis=1) max_gt_index = np.argmax(iou_pdist, axis=1) iou_map = iou_max.reshape(nA, nGh, nGw) gt_index_map = max_gt_index.reshape(nA, nGh, nGw) id_index = iou_map > self.ide_thresh fg_index = iou_map > self.fg_thresh bg_index = iou_map < self.bg_thresh ign_index = (iou_map < self.fg_thresh) * ( iou_map > self.bg_thresh) tconf[fg_index] = 1 tconf[bg_index] = 0 tconf[ign_index] = -1 gt_index = gt_index_map[fg_index] gt_box_list = tboxes[gt_index] gt_id_list = gt_ide[gt_index_map[id_index]] if np.sum(fg_index) > 0: tid[id_index] = gt_id_list fg_anchor_list = anchor_list.reshape(nA, nGh, nGw, 4)[fg_index] delta_target = self.encode_delta(gt_box_list, fg_anchor_list) tbox[fg_index] = delta_target sample['tbox{}'.format(i)] = tbox sample['tconf{}'.format(i)] = tconf sample['tide{}'.format(i)] = tid sample.pop('gt_class') sample = self.pad_box(sample, num_max) return samples @register_op class Gt2JDETargetMax(BaseOperator): __shared__ = ['num_classes'] """ Generate JDE targets by groud truth data when evaluating Args: anchors (list): anchors of JDE model anchor_masks (list): anchor_masks of JDE model downsample_ratios (list): downsample ratios of JDE model max_iou_thresh (float): iou thresh for high quality anchor num_classes (int): number of classes """ def 
__init__(self, anchors, anchor_masks, downsample_ratios, max_iou_thresh=0.60, num_classes=1): super(Gt2JDETargetMax, self).__init__() self.anchors = anchors self.anchor_masks = anchor_masks self.downsample_ratios = downsample_ratios self.max_iou_thresh = max_iou_thresh self.num_classes = num_classes def __call__(self, samples, context=None): assert len(self.anchor_masks) == len(self.downsample_ratios), \ "anchor_masks', and 'downsample_ratios' should have same length." h, w = samples[0]['image'].shape[1:3] for sample in samples: gt_bbox = sample['gt_bbox'] gt_ide = sample['gt_ide'] for i, (anchor_hw, downsample_ratio ) in enumerate(zip(self.anchors, self.downsample_ratios)): anchor_hw = np.array( anchor_hw, dtype=np.float32) / downsample_ratio nA = len(anchor_hw) nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio) tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32) tconf = np.zeros((nA, nGh, nGw), dtype=np.float32) tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32) gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy() gxy[:, 0] = gxy[:, 0] * nGw gxy[:, 1] = gxy[:, 1] * nGh gwh[:, 0] = gwh[:, 0] * nGw gwh[:, 1] = gwh[:, 1] * nGh gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int) gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int) # iou of targets-anchors (using wh only) box1 = gwh box2 = anchor_hw[:, None, :] inter_area = np.minimum(box1, box2).prod(2) iou = inter_area / ( box1.prod(1) + box2.prod(2) - inter_area + 1e-16) # Select best iou_pred and anchor iou_best = iou.max(0) # best anchor [0-2] for each target a = np.argmax(iou, axis=0) # Select best unique target-anchor combinations iou_order = np.argsort(-iou_best) # best to worst # Unique anchor selection u = np.stack((gi, gj, a), 0)[:, iou_order] _, first_unique = np.unique(u, axis=1, return_index=True) mask = iou_order[first_unique] # best anchor must share significant commonality (iou) with target # TODO: examine arbitrary threshold idx = mask[iou_best[mask] > self.max_iou_thresh] if len(idx) > 0: a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx] t_box = gt_bbox[idx] t_id = gt_ide[idx] if len(t_box.shape) == 1: t_box = t_box.reshape(1, 4) gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy() gxy[:, 0] = gxy[:, 0] * nGw gxy[:, 1] = gxy[:, 1] * nGh gwh[:, 0] = gwh[:, 0] * nGw gwh[:, 1] = gwh[:, 1] * nGh # XY coordinates tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int) # Width and height in yolo method tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh / anchor_hw[a_i]) tconf[a_i, gj_i, gi_i] = 1 tid[a_i, gj_i, gi_i] = t_id sample['tbox{}'.format(i)] = tbox sample['tconf{}'.format(i)] = tconf sample['tide{}'.format(i)] = tid class Gt2FairMOTTarget(Gt2TTFTarget): __shared__ = ['num_classes'] """ Generate FairMOT targets by ground truth data. Difference between Gt2FairMOTTarget and Gt2TTFTarget are: 1. the gaussian kernal radius to generate a heatmap. 2. the targets needed during training. Args: num_classes(int): the number of classes. down_ratio(int): the down ratio from images to heatmap, 4 by default. max_objs(int): the maximum number of ground truth objects in a image, 500 by default. 
""" def __init__(self, num_classes=1, down_ratio=4, max_objs=500): super(Gt2TTFTarget, self).__init__() self.down_ratio = down_ratio self.num_classes = num_classes self.max_objs = max_objs def __call__(self, samples, context=None): for b_id, sample in enumerate(samples): output_h = sample['image'].shape[1] // self.down_ratio output_w = sample['image'].shape[2] // self.down_ratio heatmap = np.zeros( (self.num_classes, output_h, output_w), dtype='float32') bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32) center_offset = np.zeros((self.max_objs, 2), dtype=np.float32) index = np.zeros((self.max_objs, ), dtype=np.int64) index_mask = np.zeros((self.max_objs, ), dtype=np.int32) reid = np.zeros((self.max_objs, ), dtype=np.int64) bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32) if self.num_classes > 1: # each category corresponds to a set of track ids cls_tr_ids = np.zeros( (self.num_classes, output_h, output_w), dtype=np.int64) cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64) gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] gt_ide = sample['gt_ide'] for k in range(len(gt_bbox)): cls_id = gt_class[k][0] bbox = gt_bbox[k] ide = gt_ide[k][0] bbox[[0, 2]] = bbox[[0, 2]] * output_w bbox[[1, 3]] = bbox[[1, 3]] * output_h bbox_amodal = copy.deepcopy(bbox) bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2. bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2. bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2] bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3] bbox[0] = np.clip(bbox[0], 0, output_w - 1) bbox[1] = np.clip(bbox[1], 0, output_h - 1) h = bbox[3] w = bbox[2] bbox_xy = copy.deepcopy(bbox) bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2 bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2 bbox_xy[2] = bbox_xy[0] + bbox_xy[2] bbox_xy[3] = bbox_xy[1] + bbox_xy[3] if h > 0 and w > 0: radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) radius = max(0, int(radius)) ct = np.array([bbox[0], bbox[1]], dtype=np.float32) ct_int = ct.astype(np.int32) self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius, radius) bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \ bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1] index[k] = ct_int[1] * output_w + ct_int[0] center_offset[k] = ct - ct_int index_mask[k] = 1 reid[k] = ide bbox_xys[k] = bbox_xy if self.num_classes > 1: cls_id_map[ct_int[1], ct_int[0]] = cls_id cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1 # track id start from 0 sample['heatmap'] = heatmap sample['index'] = index sample['offset'] = center_offset sample['size'] = bbox_size sample['index_mask'] = index_mask sample['reid'] = reid if self.num_classes > 1: sample['cls_id_map'] = cls_id_map sample['cls_tr_ids'] = cls_tr_ids sample['bbox_xys'] = bbox_xys sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) sample.pop('gt_score', None) sample.pop('gt_ide', None) return samples ================================================ FILE: ppdet/data/transform/op_helper.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # this file contains helper methods for BBOX processing from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import random import math import cv2 def meet_emit_constraint(src_bbox, sample_bbox): center_x = (src_bbox[2] + src_bbox[0]) / 2 center_y = (src_bbox[3] + src_bbox[1]) / 2 if center_x >= sample_bbox[0] and \ center_x <= sample_bbox[2] and \ center_y >= sample_bbox[1] and \ center_y <= sample_bbox[3]: return True return False def clip_bbox(src_bbox): src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0) src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0) src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0) src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0) return src_bbox def bbox_area(src_bbox): if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]: return 0. else: width = src_bbox[2] - src_bbox[0] height = src_bbox[3] - src_bbox[1] return width * height def is_overlap(object_bbox, sample_bbox): if object_bbox[0] >= sample_bbox[2] or \ object_bbox[2] <= sample_bbox[0] or \ object_bbox[1] >= sample_bbox[3] or \ object_bbox[3] <= sample_bbox[1]: return False else: return True def filter_and_process(sample_bbox, bboxes, labels, scores=None, keypoints=None): new_bboxes = [] new_labels = [] new_scores = [] new_keypoints = [] new_kp_ignore = [] for i in range(len(bboxes)): new_bbox = [0, 0, 0, 0] obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]] if not meet_emit_constraint(obj_bbox, sample_bbox): continue if not is_overlap(obj_bbox, sample_bbox): continue sample_width = sample_bbox[2] - sample_bbox[0] sample_height = sample_bbox[3] - sample_bbox[1] new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height new_bbox = clip_bbox(new_bbox) if bbox_area(new_bbox) > 0: new_bboxes.append(new_bbox) new_labels.append([labels[i][0]]) if scores is not None: new_scores.append([scores[i][0]]) if keypoints is not None: sample_keypoint = keypoints[0][i] for j in range(len(sample_keypoint)): kp_len = sample_height if j % 2 else sample_width sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0] sample_keypoint[j] = ( sample_keypoint[j] - sample_coord) / kp_len sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0) new_keypoints.append(sample_keypoint) new_kp_ignore.append(keypoints[1][i]) bboxes = np.array(new_bboxes) labels = np.array(new_labels) scores = np.array(new_scores) if keypoints is not None: keypoints = np.array(new_keypoints) new_kp_ignore = np.array(new_kp_ignore) return bboxes, labels, scores, (keypoints, new_kp_ignore) return bboxes, labels, scores def bbox_area_sampling(bboxes, labels, scores, target_size, min_size): new_bboxes = [] new_labels = [] new_scores = [] for i, bbox in enumerate(bboxes): w = float((bbox[2] - bbox[0]) * target_size) h = float((bbox[3] - bbox[1]) * target_size) if w * h < float(min_size * min_size): continue else: new_bboxes.append(bbox) new_labels.append(labels[i]) if scores is not None and scores.size != 
0: new_scores.append(scores[i]) bboxes = np.array(new_bboxes) labels = np.array(new_labels) scores = np.array(new_scores) return bboxes, labels, scores def generate_sample_bbox(sampler): scale = np.random.uniform(sampler[2], sampler[3]) aspect_ratio = np.random.uniform(sampler[4], sampler[5]) aspect_ratio = max(aspect_ratio, (scale**2.0)) aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) bbox_width = scale * (aspect_ratio**0.5) bbox_height = scale / (aspect_ratio**0.5) xmin_bound = 1 - bbox_width ymin_bound = 1 - bbox_height xmin = np.random.uniform(0, xmin_bound) ymin = np.random.uniform(0, ymin_bound) xmax = xmin + bbox_width ymax = ymin + bbox_height sampled_bbox = [xmin, ymin, xmax, ymax] return sampled_bbox def generate_sample_bbox_square(sampler, image_width, image_height): scale = np.random.uniform(sampler[2], sampler[3]) aspect_ratio = np.random.uniform(sampler[4], sampler[5]) aspect_ratio = max(aspect_ratio, (scale**2.0)) aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) bbox_width = scale * (aspect_ratio**0.5) bbox_height = scale / (aspect_ratio**0.5) if image_height < image_width: bbox_width = bbox_height * image_height / image_width else: bbox_height = bbox_width * image_width / image_height xmin_bound = 1 - bbox_width ymin_bound = 1 - bbox_height xmin = np.random.uniform(0, xmin_bound) ymin = np.random.uniform(0, ymin_bound) xmax = xmin + bbox_width ymax = ymin + bbox_height sampled_bbox = [xmin, ymin, xmax, ymax] return sampled_bbox def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array, resize_width): num_gt = len(bbox_labels) # np.random.randint range: [low, high) rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0 if num_gt != 0: norm_xmin = bbox_labels[rand_idx][0] norm_ymin = bbox_labels[rand_idx][1] norm_xmax = bbox_labels[rand_idx][2] norm_ymax = bbox_labels[rand_idx][3] xmin = norm_xmin * image_width ymin = norm_ymin * image_height wid = image_width * (norm_xmax - norm_xmin) hei = image_height * (norm_ymax - norm_ymin) range_size = 0 area = wid * hei for scale_ind in range(0, len(scale_array) - 1): if area > scale_array[scale_ind] ** 2 and area < \ scale_array[scale_ind + 1] ** 2: range_size = scale_ind + 1 break if area > scale_array[len(scale_array) - 2]**2: range_size = len(scale_array) - 2 scale_choose = 0.0 if range_size == 0: rand_idx_size = 0 else: # np.random.randint range: [low, high) rng_rand_size = np.random.randint(0, range_size + 1) rand_idx_size = rng_rand_size % (range_size + 1) if rand_idx_size == range_size: min_resize_val = scale_array[rand_idx_size] / 2.0 max_resize_val = min(2.0 * scale_array[rand_idx_size], 2 * math.sqrt(wid * hei)) scale_choose = random.uniform(min_resize_val, max_resize_val) else: min_resize_val = scale_array[rand_idx_size] / 2.0 max_resize_val = 2.0 * scale_array[rand_idx_size] scale_choose = random.uniform(min_resize_val, max_resize_val) sample_bbox_size = wid * resize_width / scale_choose w_off_orig = 0.0 h_off_orig = 0.0 if sample_bbox_size < max(image_height, image_width): if wid <= sample_bbox_size: w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, xmin) else: w_off_orig = np.random.uniform(xmin, xmin + wid - sample_bbox_size) if hei <= sample_bbox_size: h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, ymin) else: h_off_orig = np.random.uniform(ymin, ymin + hei - sample_bbox_size) else: w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0) w_off_orig = math.floor(w_off_orig) 
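        # (both offsets are floored to whole pixels before being normalized
        # by the image size below)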
h_off_orig = math.floor(h_off_orig) # Figure out top left coordinates. w_off = float(w_off_orig / image_width) h_off = float(h_off_orig / image_height) sampled_bbox = [ w_off, h_off, w_off + float(sample_bbox_size / image_width), h_off + float(sample_bbox_size / image_height) ] return sampled_bbox else: return 0 def jaccard_overlap(sample_bbox, object_bbox): if sample_bbox[0] >= object_bbox[2] or \ sample_bbox[2] <= object_bbox[0] or \ sample_bbox[1] >= object_bbox[3] or \ sample_bbox[3] <= object_bbox[1]: return 0 intersect_xmin = max(sample_bbox[0], object_bbox[0]) intersect_ymin = max(sample_bbox[1], object_bbox[1]) intersect_xmax = min(sample_bbox[2], object_bbox[2]) intersect_ymax = min(sample_bbox[3], object_bbox[3]) intersect_size = (intersect_xmax - intersect_xmin) * ( intersect_ymax - intersect_ymin) sample_bbox_size = bbox_area(sample_bbox) object_bbox_size = bbox_area(object_bbox) overlap = intersect_size / ( sample_bbox_size + object_bbox_size - intersect_size) return overlap def intersect_bbox(bbox1, bbox2): if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \ bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]: intersection_box = [0.0, 0.0, 0.0, 0.0] else: intersection_box = [ max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]), min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3]) ] return intersection_box def bbox_coverage(bbox1, bbox2): inter_box = intersect_bbox(bbox1, bbox2) intersect_size = bbox_area(inter_box) if intersect_size > 0: bbox1_size = bbox_area(bbox1) return intersect_size / bbox1_size else: return 0. def satisfy_sample_constraint(sampler, sample_bbox, gt_bboxes, satisfy_all=False): if sampler[6] == 0 and sampler[7] == 0: return True satisfied = [] for i in range(len(gt_bboxes)): object_bbox = [ gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] ] overlap = jaccard_overlap(sample_bbox, object_bbox) if sampler[6] != 0 and \ overlap < sampler[6]: satisfied.append(False) continue if sampler[7] != 0 and \ overlap > sampler[7]: satisfied.append(False) continue satisfied.append(True) if not satisfy_all: return True if satisfy_all: return np.all(satisfied) else: return False def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes): if sampler[6] == 0 and sampler[7] == 0: has_jaccard_overlap = False else: has_jaccard_overlap = True if sampler[8] == 0 and sampler[9] == 0: has_object_coverage = False else: has_object_coverage = True if not has_jaccard_overlap and not has_object_coverage: return True found = False for i in range(len(gt_bboxes)): object_bbox = [ gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] ] if has_jaccard_overlap: overlap = jaccard_overlap(sample_bbox, object_bbox) if sampler[6] != 0 and \ overlap < sampler[6]: continue if sampler[7] != 0 and \ overlap > sampler[7]: continue found = True if has_object_coverage: object_coverage = bbox_coverage(object_bbox, sample_bbox) if sampler[8] != 0 and \ object_coverage < sampler[8]: continue if sampler[9] != 0 and \ object_coverage > sampler[9]: continue found = True if found: return True return found def crop_image_sampling(img, sample_bbox, image_width, image_height, target_size): # no clipping here xmin = int(sample_bbox[0] * image_width) xmax = int(sample_bbox[2] * image_width) ymin = int(sample_bbox[1] * image_height) ymax = int(sample_bbox[3] * image_height) w_off = xmin h_off = ymin width = xmax - xmin height = ymax - ymin cross_xmin = max(0.0, float(w_off)) cross_ymin = max(0.0, float(h_off)) cross_xmax = min(float(w_off + width - 1.0), float(image_width)) cross_ymax = 
min(float(h_off + height - 1.0), float(image_height)) cross_width = cross_xmax - cross_xmin cross_height = cross_ymax - cross_ymin roi_xmin = 0 if w_off >= 0 else abs(w_off) roi_ymin = 0 if h_off >= 0 else abs(h_off) roi_width = cross_width roi_height = cross_height roi_y1 = int(roi_ymin) roi_y2 = int(roi_ymin + roi_height) roi_x1 = int(roi_xmin) roi_x2 = int(roi_xmin + roi_width) cross_y1 = int(cross_ymin) cross_y2 = int(cross_ymin + cross_height) cross_x1 = int(cross_xmin) cross_x2 = int(cross_xmin + cross_width) sample_img = np.zeros((height, width, 3)) sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ img[cross_y1: cross_y2, cross_x1: cross_x2] sample_img = cv2.resize( sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) return sample_img def is_poly(segm): assert isinstance(segm, (list, dict)), \ "Invalid segm type: {}".format(type(segm)) return isinstance(segm, list) def gaussian_radius(bbox_size, min_overlap): height, width = bbox_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = np.sqrt(b1**2 - 4 * a1 * c1) radius1 = (b1 + sq1) / (2 * a1) a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = np.sqrt(b2**2 - 4 * a2 * c2) radius2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = np.sqrt(b3**2 - 4 * a3 * c3) radius3 = (b3 + sq3) / 2 return min(radius1, radius2, radius3) def draw_gaussian(heatmap, center, radius, k=1, delte=6): diameter = 2 * radius + 1 sigma = diameter / delte gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma) x, y = center height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: radius + right] np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) def gaussian2D(shape, sigma_x=1, sigma_y=1): m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y))) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_umich_gaussian(heatmap, center, radius, k=1): """ draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 """ diameter = 2 * radius + 1 gaussian = gaussian2D( (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: radius + right] if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap def get_border(border, size): i = 1 while size - border // i <= border // i: i *= 2 return border // i ================================================ FILE: ppdet/data/transform/operators.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # function: # operators to process sample, # eg: decode/resize/crop image from __future__ import absolute_import from __future__ import print_function from __future__ import division try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Number, Integral import uuid import random import math import numpy as np import os import copy import logging import cv2 from PIL import Image, ImageDraw, ImageEnhance from pycocotools import mask import pickle import threading MUTEX = threading.Lock() import paddle from ppdet.core.workspace import serializable from ..reader import Compose from .op_helper import (satisfy_sample_constraint, filter_and_process, generate_sample_bbox, clip_bbox, data_anchor_sampling, satisfy_sample_constraint_coverage, crop_image_sampling, generate_sample_bbox_square, bbox_area_sampling, is_poly, get_border) from ppdet.utils.logger import setup_logger from ppdet.utils.compact import imagedraw_textsize_c from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform logger = setup_logger(__name__) registered_ops = [] def register_op(cls): registered_ops.append(cls.__name__) if not hasattr(BaseOperator, cls.__name__): setattr(BaseOperator, cls.__name__, cls) else: raise KeyError("The {} class has been registered.".format(cls.__name__)) return serializable(cls) class BboxError(ValueError): pass class ImageError(ValueError): pass class BaseOperator(object): def __init__(self, name=None): if name is None: name = self.__class__.__name__ self._id = name + '_' + str(uuid.uuid4())[-6:] def apply(self, sample, context=None): """ Process a sample. Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ return sample def __call__(self, sample, context=None): """ Process a sample. 
Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ if isinstance(sample, Sequence): for i in range(len(sample)): sample[i] = self.apply(sample[i], context) else: sample = self.apply(sample, context) return sample def __str__(self): return str(self._id) @register_op class Decode(BaseOperator): def __init__(self, rtn_im_file=False): """ Transform the image data to numpy format following the rgb format """ super(Decode, self).__init__() self.rtn_im_file = rtn_im_file def apply(self, sample, context=None): """ load image if 'im_file' field is not empty but 'image' is""" if 'image' not in sample: with open(sample['im_file'], 'rb') as f: sample['image'] = f.read() if not self.rtn_im_file: sample.pop('im_file') try: im = sample['image'] data = np.frombuffer(im, dtype='uint8') im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode if 'keep_ori_im' in sample and sample['keep_ori_im']: sample['ori_image'] = im im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) except: im = sample['image'] sample['image'] = im if 'h' not in sample: sample['h'] = im.shape[0] elif sample['h'] != im.shape[0]: logger.warning( "The actual image height: {} is not equal to the " "height: {} in annotation, and update sample['h'] by actual " "image height.".format(im.shape[0], sample['h'])) sample['h'] = im.shape[0] if 'w' not in sample: sample['w'] = im.shape[1] elif sample['w'] != im.shape[1]: logger.warning( "The actual image width: {} is not equal to the " "width: {} in annotation, and update sample['w'] by actual " "image width.".format(im.shape[1], sample['w'])) sample['w'] = im.shape[1] sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) return sample def _make_dirs(dirname): try: from pathlib import Path except ImportError: from pathlib2 import Path Path(dirname).mkdir(exist_ok=True) @register_op class DecodeCache(BaseOperator): def __init__(self, cache_root=None): '''decode image and caching ''' super(DecodeCache, self).__init__() self.use_cache = False if cache_root is None else True self.cache_root = cache_root if cache_root is not None: _make_dirs(cache_root) def apply(self, sample, context=None): if self.use_cache and os.path.exists( self.cache_path(self.cache_root, sample['im_file'])): path = self.cache_path(self.cache_root, sample['im_file']) im = self.load(path) else: if 'image' not in sample: with open(sample['im_file'], 'rb') as f: sample['image'] = f.read() im = sample['image'] data = np.frombuffer(im, dtype='uint8') im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode if 'keep_ori_im' in sample and sample['keep_ori_im']: sample['ori_image'] = im im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) if self.use_cache and not os.path.exists( self.cache_path(self.cache_root, sample['im_file'])): path = self.cache_path(self.cache_root, sample['im_file']) self.dump(im, path) sample['image'] = im sample['h'] = im.shape[0] sample['w'] = im.shape[1] sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) sample.pop('im_file') return sample @staticmethod def cache_path(dir_oot, im_file): return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') @staticmethod def load(path): with open(path, 'rb') as f: im = pickle.load(f) return im @staticmethod def dump(obj, path): MUTEX.acquire() try: with open(path, 'wb') as f: pickle.dump(obj, f) except Exception as e: 
logger.warning('dump {} occurs exception {}'.format(path, str(e))) finally: MUTEX.release() @register_op class SniperDecodeCrop(BaseOperator): def __init__(self): super(SniperDecodeCrop, self).__init__() def __call__(self, sample, context=None): if 'image' not in sample: with open(sample['im_file'], 'rb') as f: sample['image'] = f.read() sample.pop('im_file') im = sample['image'] data = np.frombuffer(im, dtype='uint8') im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode if 'keep_ori_im' in sample and sample['keep_ori_im']: sample['ori_image'] = im im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) chip = sample['chip'] x1, y1, x2, y2 = [int(xi) for xi in chip] im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[ 1]), :] sample['image'] = im h = im.shape[0] w = im.shape[1] # sample['im_info'] = [h, w, 1.0] sample['h'] = h sample['w'] = w sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) return sample @register_op class Permute(BaseOperator): def __init__(self): """ Change the channel to be (C, H, W) """ super(Permute, self).__init__() def apply(self, sample, context=None): im = sample['image'] im = im.transpose((2, 0, 1)) sample['image'] = im if 'pre_image' in sample: pre_im = sample['pre_image'] pre_im = pre_im.transpose((2, 0, 1)) sample['pre_image'] = pre_im return sample @register_op class Lighting(BaseOperator): """ Lighting the image by eigenvalues and eigenvectors Args: eigval (list): eigenvalues eigvec (list): eigenvectors alphastd (float): random weight of lighting, 0.1 by default """ def __init__(self, eigval, eigvec, alphastd=0.1): super(Lighting, self).__init__() self.alphastd = alphastd self.eigval = np.array(eigval).astype('float32') self.eigvec = np.array(eigvec).astype('float32') def apply(self, sample, context=None): alpha = np.random.normal(scale=self.alphastd, size=(3, )) sample['image'] += np.dot(self.eigvec, self.eigval * alpha) if 'pre_image' in sample: sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha) return sample @register_op class RandomErasingImage(BaseOperator): def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3): """ Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896 Args: prob (float): probability to carry out random erasing lower (float): lower limit of the erasing area ratio higher (float): upper limit of the erasing area ratio aspect_ratio (float): aspect ratio of the erasing region """ super(RandomErasingImage, self).__init__() self.prob = prob self.lower = lower self.higher = higher self.aspect_ratio = aspect_ratio def apply(self, sample, context=None): gt_bbox = sample['gt_bbox'] im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image is not a numpy array.".format(self)) if len(im.shape) != 3: raise ImageError("{}: image is not 3-dimensional.".format(self)) for idx in range(gt_bbox.shape[0]): if self.prob <= np.random.rand(): continue x1, y1, x2, y2 = gt_bbox[idx, :] w_bbox = x2 - x1 h_bbox = y2 - y1 area = w_bbox * h_bbox target_area = random.uniform(self.lower, self.higher) * area aspect_ratio = random.uniform(self.aspect_ratio, 1 / self.aspect_ratio) h = int(round(math.sqrt(target_area * aspect_ratio))) w = int(round(math.sqrt(target_area / aspect_ratio))) if w < w_bbox and h < h_bbox: off_y1 = random.randint(0, int(h_bbox - h)) off_x1 = random.randint(0, int(w_bbox - w)) im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int( x1 + off_x1 + w), :] = 0 sample['image'] 
= im
        return sample


@register_op
class NormalizeImage(BaseOperator):
    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225],
                 is_scale=True, norm_type='mean_std'):
        """
        Args:
            mean (list): the pixel mean
            std (list): the pixel standard deviation
            is_scale (bool): scale the pixel to [0,1]
            norm_type (str): type in ['mean_std', 'none']
        """
        super(NormalizeImage, self).__init__()
        self.mean = mean
        self.std = std
        self.is_scale = is_scale
        self.norm_type = norm_type
        if not (isinstance(self.mean, list) and isinstance(self.std, list) and
                isinstance(self.is_scale, bool) and
                self.norm_type in ['mean_std', 'none']):
            raise TypeError("{}: input type is invalid.".format(self))
        from functools import reduce
        if reduce(lambda x, y: x * y, self.std) == 0:
            raise ValueError('{}: std is invalid!'.format(self))

    def apply(self, sample, context=None):
        """Normalize the image.
        Operators:
            1.(optional) Scale the pixel to [0,1]
            2.(optional) Each pixel minus mean and is divided by std
        """
        im = sample['image']
        im = im.astype(np.float32, copy=False)
        if self.is_scale:
            scale = 1.0 / 255.0
            im *= scale
        if self.norm_type == 'mean_std':
            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
            std = np.array(self.std)[np.newaxis, np.newaxis, :]
            im -= mean
            im /= std
        sample['image'] = im

        if 'pre_image' in sample:
            pre_im = sample['pre_image']
            pre_im = pre_im.astype(np.float32, copy=False)
            if self.is_scale:
                scale = 1.0 / 255.0
                pre_im *= scale
            if self.norm_type == 'mean_std':
                mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
                std = np.array(self.std)[np.newaxis, np.newaxis, :]
                pre_im -= mean
                pre_im /= std
            sample['pre_image'] = pre_im
        return sample


@register_op
class GridMask(BaseOperator):
    def __init__(self, use_h=True, use_w=True, rotate=1, offset=False,
                 ratio=0.5, mode=1, prob=0.7, upper_iter=360000):
        """
        GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086

        Args:
            use_h (bool): whether to mask vertically
            use_w (bool): whether to mask horizontally
            rotate (float): angle for the mask to rotate
            offset (float): mask offset
            ratio (float): mask ratio
            mode (int): gridmask mode
            prob (float): max probability to carry out gridmask
            upper_iter (int): suggested to be equal to global max_iter
        """
        super(GridMask, self).__init__()
        self.use_h = use_h
        self.use_w = use_w
        self.rotate = rotate
        self.offset = offset
        self.ratio = ratio
        self.mode = mode
        self.prob = prob
        self.upper_iter = upper_iter
        from .gridmask_utils import Gridmask
        self.gridmask_op = Gridmask(
            use_h, use_w, rotate=rotate, offset=offset, ratio=ratio,
            mode=mode, prob=prob, upper_iter=upper_iter)

    def apply(self, sample, context=None):
        sample['image'] = self.gridmask_op(sample['image'],
                                           sample['curr_iter'])
        return sample


@register_op
class RandomDistort(BaseOperator):
    """Random color distortion.

    Args:
        hue (list): hue settings. in [lower, upper, probability] format.
        saturation (list): saturation settings. in [lower, upper, probability] format.
        contrast (list): contrast settings. in [lower, upper, probability] format.
        brightness (list): brightness settings. in [lower, upper, probability] format.
        random_apply (bool): whether to apply in random (yolo) or fixed (SSD) order.
        count (int): the number of distortions to apply.
        random_channel (bool): whether to swap channels randomly.
        prob (float): the probability of enhancing the sample.
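
    Example (an illustrative sketch; expects an HWC image array in
    sample['image'] with values in [0, 255]):

        import numpy as np
        op = RandomDistort()
        sample = {'image': np.random.randint(0, 255, (480, 640, 3)).astype(np.float32)}
        sample = op(sample)  # 'image' comes back as float32 after the PIL-based distortions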
""" def __init__(self, hue=[-18, 18, 0.5], saturation=[0.5, 1.5, 0.5], contrast=[0.5, 1.5, 0.5], brightness=[0.5, 1.5, 0.5], random_apply=True, count=4, random_channel=False, prob=1.0): super(RandomDistort, self).__init__() self.hue = hue self.saturation = saturation self.contrast = contrast self.brightness = brightness self.random_apply = random_apply self.count = count self.random_channel = random_channel self.prob = prob def apply_hue(self, img): low, high, prob = self.hue if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = np.array(img.convert('HSV')) img[:, :, 0] = img[:, :, 0] + delta img = Image.fromarray(img, mode='HSV').convert('RGB') return img def apply_saturation(self, img): low, high, prob = self.saturation if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = ImageEnhance.Color(img).enhance(delta) return img def apply_contrast(self, img): low, high, prob = self.contrast if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = ImageEnhance.Contrast(img).enhance(delta) return img def apply_brightness(self, img): low, high, prob = self.brightness if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = ImageEnhance.Brightness(img).enhance(delta) return img def apply(self, sample, context=None): if random.random() > self.prob: return sample img = sample['image'] img = Image.fromarray(img.astype(np.uint8)) if self.random_apply: functions = [ self.apply_brightness, self.apply_contrast, self.apply_saturation, self.apply_hue ] distortions = np.random.permutation(functions)[:self.count] for func in distortions: img = func(img) img = np.asarray(img).astype(np.float32) sample['image'] = img return sample img = self.apply_brightness(img) mode = np.random.randint(0, 2) if mode: img = self.apply_contrast(img) img = self.apply_saturation(img) img = self.apply_hue(img) if not mode: img = self.apply_contrast(img) img = np.asarray(img).astype(np.float32) if self.random_channel: if np.random.randint(0, 2): img = img[..., np.random.permutation(3)] sample['image'] = img return sample @register_op class PhotoMetricDistortion(BaseOperator): """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): super(PhotoMetricDistortion, self).__init__() self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def apply(self, results, context=None): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" img = results['image'] img = img.astype(np.float32) # random brightness if np.random.randint(2): delta = np.random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = np.random.randint(2) if mode == 1: if np.random.randint(2): alpha = np.random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) # random saturation if np.random.randint(2): img[..., 1] *= np.random.uniform(self.saturation_lower, self.saturation_upper) # random hue if np.random.randint(2): img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR) # random contrast if mode == 0: if np.random.randint(2): alpha = np.random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels if np.random.randint(2): img = img[..., np.random.permutation(3)] results['image'] = img return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str @register_op class AutoAugment(BaseOperator): def __init__(self, autoaug_type="v1"): """ Args: autoaug_type (str): autoaug type, support v0, v1, v2, v3, test """ super(AutoAugment, self).__init__() self.autoaug_type = autoaug_type def apply(self, sample, context=None): """ Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172 """ im = sample['image'] gt_bbox = sample['gt_bbox'] if not isinstance(im, np.ndarray): raise TypeError("{}: image is not a numpy array.".format(self)) if len(im.shape) != 3: raise ImageError("{}: image is not 3-dimensional.".format(self)) if len(gt_bbox) == 0: return sample height, width, _ = im.shape norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32) norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height) norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width) norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height) norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width) from .autoaugment_utils import distort_image_with_autoaugment im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox, self.autoaug_type) gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width) gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height) gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width) gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height) sample['image'] = im sample['gt_bbox'] = gt_bbox return sample @register_op class RandomFlip(BaseOperator): def __init__(self, prob=0.5): """ Args: prob (float): the probability of flipping image """ super(RandomFlip, self).__init__() self.prob = prob if not (isinstance(self.prob, float)): raise TypeError("{}: input type is invalid.".format(self)) def apply_segm(self, segms, height, width): def _flip_poly(poly, width): flipped_poly = np.array(poly) flipped_poly[0::2] = width - np.array(poly[0::2]) return flipped_poly.tolist() def _flip_rle(rle, height, width): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) mask = mask[:, ::-1] rle = mask_util.encode(np.array(mask, order='F', 
dtype=np.uint8)) return rle flipped_segms = [] for segm in segms: if is_poly(segm): # Polygon format flipped_segms.append([_flip_poly(poly, width) for poly in segm]) else: # RLE format import pycocotools.mask as mask_util flipped_segms.append(_flip_rle(segm, height, width)) return flipped_segms def apply_keypoint(self, gt_keypoint, width): for i in range(gt_keypoint.shape[1]): if i % 2 == 0: old_x = gt_keypoint[:, i].copy() gt_keypoint[:, i] = width - old_x return gt_keypoint def apply_image(self, image): return image[:, ::-1, :] def apply_bbox(self, bbox, width): oldx1 = bbox[:, 0].copy() oldx2 = bbox[:, 2].copy() bbox[:, 0] = width - oldx2 bbox[:, 2] = width - oldx1 return bbox def apply(self, sample, context=None): """Filp the image and bounding box. Operators: 1. Flip the image numpy. 2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!) 3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!) Output: sample: the image, bounding box and segmentation part in sample are flipped. """ if np.random.uniform(0, 1) < self.prob: im = sample['image'] height, width = im.shape[:2] im = self.apply_image(im) if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, width) if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: sample['gt_keypoint'] = self.apply_keypoint( sample['gt_keypoint'], width) if 'semantic' in sample and sample['semantic']: sample['semantic'] = sample['semantic'][:, ::-1] if 'gt_segm' in sample and sample['gt_segm'].any(): sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] sample['flipped'] = True sample['image'] = im return sample @register_op class Resize(BaseOperator): def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): """ Resize image to target size. if keep_ratio is True, resize the image's long side to the maximum of target_size if keep_ratio is False, resize the image to target size(h, w) Args: target_size (int|list): image target size keep_ratio (bool): whether keep_ratio or not, default true interp (int): the interpolation method """ super(Resize, self).__init__() self.keep_ratio = keep_ratio self.interp = interp if not isinstance(target_size, (Integral, Sequence)): raise TypeError( "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". 
format(type(target_size))) if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size def apply_image(self, image, scale): im_scale_x, im_scale_y = scale return cv2.resize( image, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) def apply_bbox(self, bbox, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size bbox[:, 0::2] *= im_scale_x bbox[:, 1::2] *= im_scale_y bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) return bbox def apply_area(self, area, scale): im_scale_x, im_scale_y = scale return area * im_scale_x * im_scale_y def apply_joints(self, joints, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size joints[..., 0] *= im_scale_x joints[..., 1] *= im_scale_y joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) return joints def apply_segm(self, segms, im_size, scale): def _resize_poly(poly, im_scale_x, im_scale_y): resized_poly = np.array(poly).astype('float32') resized_poly[0::2] *= im_scale_x resized_poly[1::2] *= im_scale_y return resized_poly.tolist() def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, im_h, im_w) mask = mask_util.decode(rle) mask = cv2.resize( mask, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle im_h, im_w = im_size im_scale_x, im_scale_y = scale resized_segms = [] for segm in segms: if is_poly(segm): # Polygon format resized_segms.append([ _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm ]) else: # RLE format import pycocotools.mask as mask_util resized_segms.append( _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) return resized_segms def apply(self, sample, context=None): """ Resize the image numpy. 
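
        Example (illustrative only):

            import numpy as np
            op = Resize(target_size=[800, 1333], keep_ratio=True)
            sample = {'image': np.zeros((480, 640, 3), dtype=np.uint8)}
            sample = op(sample)  # also sets 'im_shape' and 'scale_factor'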
""" im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) # apply image if len(im.shape) == 3: im_shape = im.shape else: im_shape = im[0].shape if self.keep_ratio: im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) target_size_min = np.min(self.target_size) target_size_max = np.max(self.target_size) im_scale = min(target_size_min / im_size_min, target_size_max / im_size_max) resize_h = int(im_scale * float(im_shape[0]) + 0.5) resize_w = int(im_scale * float(im_shape[1]) + 0.5) else: resize_h, resize_w = self.target_size im_scale_y = resize_h / im_shape[0] im_scale_x = resize_w / im_shape[1] if len(im.shape) == 3: im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) sample['image'] = im.astype(np.float32) else: resized_images = [] for one_im in im: applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y]) resized_images.append(applied_im) sample['image'] = np.array(resized_images) # 2d keypoints resize if 'kps2d' in sample.keys(): kps2d = sample['kps2d'] kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y sample['kps2d'] = kps2d sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], [im_scale_x, im_scale_y], [resize_w, resize_h]) # apply areas if 'gt_areas' in sample: sample['gt_areas'] = self.apply_area(sample['gt_areas'], [im_scale_x, im_scale_y]) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y]) # apply semantic if 'semantic' in sample and sample['semantic']: semantic = sample['semantic'] semantic = cv2.resize( semantic.astype('float32'), None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) semantic = np.asarray(semantic).astype('int32') semantic = np.expand_dims(semantic, 0) sample['semantic'] = semantic # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.resize( gt_segm, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_NEAREST) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if 'gt_joints' in sample: sample['gt_joints'] = self.apply_joints(sample['gt_joints'], [im_scale_x, im_scale_y], [resize_w, resize_h]) return sample @register_op class MultiscaleTestResize(BaseOperator): def __init__(self, origin_target_size=[800, 1333], target_size=[], interp=cv2.INTER_LINEAR, use_flip=True): """ Rescale image to the each size in target size, and capped at max_size. Args: origin_target_size (list): origin target size of image target_size (list): A list of target sizes of image. interp (int): the interpolation method. use_flip (bool): whether use flip augmentation. """ super(MultiscaleTestResize, self).__init__() self.interp = interp self.use_flip = use_flip if not isinstance(target_size, Sequence): raise TypeError( "Type of target_size is invalid. Must be List or Tuple, now is {}". 
format(type(target_size))) self.target_size = target_size if not isinstance(origin_target_size, Sequence): raise TypeError( "Type of origin_target_size is invalid. Must be List or Tuple, now is {}". format(type(origin_target_size))) self.origin_target_size = origin_target_size def apply(self, sample, context=None): """ Resize the image numpy for multi-scale test. """ samples = [] resizer = Resize( self.origin_target_size, keep_ratio=True, interp=self.interp) samples.append(resizer(sample.copy(), context)) if self.use_flip: flipper = RandomFlip(1.1) samples.append(flipper(sample.copy(), context=context)) for size in self.target_size: resizer = Resize(size, keep_ratio=True, interp=self.interp) samples.append(resizer(sample.copy(), context)) return samples @register_op class RandomResize(BaseOperator): def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR, random_range=False, random_size=True, random_interp=False): """ Resize image to target size randomly. random target_size and interpolation method Args: target_size (int, list, tuple): image target size, if random size is True, must be list or tuple keep_ratio (bool): whether keep_raio or not, default true interp (int): the interpolation method random_range (bool): whether random select target size of image, the target_size must be a [[min_short_edge, long_edge], [max_short_edge, long_edge]] random_size (bool): whether random select target size of image random_interp (bool): whether random select interpolation method """ super(RandomResize, self).__init__() self.keep_ratio = keep_ratio self.interp = interp self.interps = [ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4, ] assert isinstance(target_size, ( Integral, Sequence)), "target_size must be Integer, List or Tuple" if (random_range or random_size) and not isinstance(target_size, Sequence): raise TypeError( "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}". format(type(target_size))) if random_range and not len(target_size) == 2: raise TypeError( "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True." ) self.target_size = target_size self.random_range = random_range self.random_size = random_size self.random_interp = random_interp def apply(self, sample, context=None): """ Resize the image numpy. """ if self.random_range: short_edge = np.random.randint(self.target_size[0][0], self.target_size[1][0] + 1) long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1) target_size = [short_edge, long_edge] else: if self.random_size: target_size = random.choice(self.target_size) else: target_size = self.target_size if self.random_interp: interp = random.choice(self.interps) else: interp = self.interp resizer = Resize(target_size, self.keep_ratio, interp) return resizer(sample, context=context) @register_op class RandomExpand(BaseOperator): """Random expand the canvas. Args: ratio (float): maximum expansion ratio. prob (float): probability to expand. fill_value (list): color value used to fill the canvas. in RGB order. 
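
    Example (illustrative; note that, as implemented, the sample is returned
    unchanged when uniform(0, 1) < prob, so a smaller prob means expansion
    happens more often, and prob=0. forces it; the actual canvas filling is
    done by the Pad op defined later in this file):

        import numpy as np
        op = RandomExpand(ratio=2., prob=0.)
        sample = {'image': np.zeros((100, 100, 3), dtype=np.float32),
                  'gt_bbox': np.array([[10., 10., 50., 50.]], 'float32'),
                  'gt_class': np.array([[0]], 'int32')}
        sample = op(sample)  # canvas grows up to ratio x, boxes shifted by the offset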
""" def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)): super(RandomExpand, self).__init__() assert ratio > 1.01, "expand ratio must be larger than 1.01" self.ratio = ratio self.prob = prob assert isinstance(fill_value, (Number, Sequence)), \ "fill value must be either float or sequence" if isinstance(fill_value, Number): fill_value = (fill_value, ) * 3 if not isinstance(fill_value, tuple): fill_value = tuple(fill_value) self.fill_value = fill_value def apply(self, sample, context=None): if np.random.uniform(0., 1.) < self.prob: return sample im = sample['image'] height, width = im.shape[:2] ratio = np.random.uniform(1., self.ratio) h = int(height * ratio) w = int(width * ratio) if not h > height or not w > width: return sample y = np.random.randint(0, h - height) x = np.random.randint(0, w - width) offsets, size = [x, y], [h, w] pad = Pad(size, pad_mode=-1, offsets=offsets, fill_value=self.fill_value) return pad(sample, context=context) @register_op class CropWithSampling(BaseOperator): def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True): """ Args: batch_sampler (list): Multiple sets of different parameters for cropping. satisfy_all (bool): whether all boxes must satisfy. e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]] [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap] avoid_no_bbox (bool): whether to avoid the situation where the box does not appear. """ super(CropWithSampling, self).__init__() self.batch_sampler = batch_sampler self.satisfy_all = satisfy_all self.avoid_no_bbox = avoid_no_bbox def apply(self, sample, context): """ Crop the image and modify bounding box. Operators: 1. Scale the image width and height. 2. Crop the image according to a radom sample. 3. Rescale the bounding box. 4. Determine if the new bbox is satisfied in the new image. Returns: sample: the image, bounding box are replaced. 
""" assert 'image' in sample, "image data not found" im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] im_height, im_width = im.shape[:2] gt_score = None if 'gt_score' in sample: gt_score = sample['gt_score'] sampled_bbox = [] gt_bbox = gt_bbox.tolist() for sampler in self.batch_sampler: found = 0 for i in range(sampler[1]): if found >= sampler[0]: break sample_bbox = generate_sample_bbox(sampler) if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox, self.satisfy_all): sampled_bbox.append(sample_bbox) found = found + 1 im = np.array(im) while sampled_bbox: idx = int(np.random.uniform(0, len(sampled_bbox))) sample_bbox = sampled_bbox.pop(idx) sample_bbox = clip_bbox(sample_bbox) crop_bbox, crop_class, crop_score = \ filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score) if self.avoid_no_bbox: if len(crop_bbox) < 1: continue xmin = int(sample_bbox[0] * im_width) xmax = int(sample_bbox[2] * im_width) ymin = int(sample_bbox[1] * im_height) ymax = int(sample_bbox[3] * im_height) im = im[ymin:ymax, xmin:xmax] sample['image'] = im sample['gt_bbox'] = crop_bbox sample['gt_class'] = crop_class sample['gt_score'] = crop_score return sample return sample @register_op class CropWithDataAchorSampling(BaseOperator): def __init__(self, batch_sampler, anchor_sampler=None, target_size=None, das_anchor_scales=[16, 32, 64, 128], sampling_prob=0.5, min_size=8., avoid_no_bbox=True): """ Args: anchor_sampler (list): anchor_sampling sets of different parameters for cropping. batch_sampler (list): Multiple sets of different parameters for cropping. e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap, min coverage, max coverage] target_size (int): target image size. das_anchor_scales (list[float]): a list of anchor scales in data anchor smapling. min_size (float): minimum size of sampled bbox. avoid_no_bbox (bool): whether to avoid the situation where the box does not appear. """ super(CropWithDataAchorSampling, self).__init__() self.anchor_sampler = anchor_sampler self.batch_sampler = batch_sampler self.target_size = target_size self.sampling_prob = sampling_prob self.min_size = min_size self.avoid_no_bbox = avoid_no_bbox self.das_anchor_scales = np.array(das_anchor_scales) def apply(self, sample, context): """ Crop the image and modify bounding box. Operators: 1. Scale the image width and height. 2. Crop the image according to a radom sample. 3. Rescale the bounding box. 4. Determine if the new bbox is satisfied in the new image. Returns: sample: the image, bounding box are replaced. """ assert 'image' in sample, "image data not found" im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] image_height, image_width = im.shape[:2] gt_bbox[:, 0] /= image_width gt_bbox[:, 1] /= image_height gt_bbox[:, 2] /= image_width gt_bbox[:, 3] /= image_height gt_score = None if 'gt_score' in sample: gt_score = sample['gt_score'] sampled_bbox = [] gt_bbox = gt_bbox.tolist() prob = np.random.uniform(0., 1.) 
if prob > self.sampling_prob: # anchor sampling assert self.anchor_sampler for sampler in self.anchor_sampler: found = 0 for i in range(sampler[1]): if found >= sampler[0]: break sample_bbox = data_anchor_sampling( gt_bbox, image_width, image_height, self.das_anchor_scales, self.target_size) if sample_bbox == 0: break if satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bbox): sampled_bbox.append(sample_bbox) found = found + 1 im = np.array(im) while sampled_bbox: idx = int(np.random.uniform(0, len(sampled_bbox))) sample_bbox = sampled_bbox.pop(idx) if 'gt_keypoint' in sample.keys(): keypoints = (sample['gt_keypoint'], sample['keypoint_ignore']) crop_bbox, crop_class, crop_score, gt_keypoints = \ filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score, keypoints=keypoints) else: crop_bbox, crop_class, crop_score = filter_and_process( sample_bbox, gt_bbox, gt_class, scores=gt_score) crop_bbox, crop_class, crop_score = bbox_area_sampling( crop_bbox, crop_class, crop_score, self.target_size, self.min_size) if self.avoid_no_bbox: if len(crop_bbox) < 1: continue im = crop_image_sampling(im, sample_bbox, image_width, image_height, self.target_size) height, width = im.shape[:2] crop_bbox[:, 0] *= width crop_bbox[:, 1] *= height crop_bbox[:, 2] *= width crop_bbox[:, 3] *= height sample['image'] = im sample['gt_bbox'] = crop_bbox sample['gt_class'] = crop_class if 'gt_score' in sample: sample['gt_score'] = crop_score if 'gt_keypoint' in sample.keys(): sample['gt_keypoint'] = gt_keypoints[0] sample['keypoint_ignore'] = gt_keypoints[1] return sample return sample else: for sampler in self.batch_sampler: found = 0 for i in range(sampler[1]): if found >= sampler[0]: break sample_bbox = generate_sample_bbox_square( sampler, image_width, image_height) if satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bbox): sampled_bbox.append(sample_bbox) found = found + 1 im = np.array(im) while sampled_bbox: idx = int(np.random.uniform(0, len(sampled_bbox))) sample_bbox = sampled_bbox.pop(idx) sample_bbox = clip_bbox(sample_bbox) if 'gt_keypoint' in sample.keys(): keypoints = (sample['gt_keypoint'], sample['keypoint_ignore']) crop_bbox, crop_class, crop_score, gt_keypoints = \ filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score, keypoints=keypoints) else: crop_bbox, crop_class, crop_score = filter_and_process( sample_bbox, gt_bbox, gt_class, scores=gt_score) # sampling bbox according the bbox area crop_bbox, crop_class, crop_score = bbox_area_sampling( crop_bbox, crop_class, crop_score, self.target_size, self.min_size) if self.avoid_no_bbox: if len(crop_bbox) < 1: continue xmin = int(sample_bbox[0] * image_width) xmax = int(sample_bbox[2] * image_width) ymin = int(sample_bbox[1] * image_height) ymax = int(sample_bbox[3] * image_height) im = im[ymin:ymax, xmin:xmax] height, width = im.shape[:2] crop_bbox[:, 0] *= width crop_bbox[:, 1] *= height crop_bbox[:, 2] *= width crop_bbox[:, 3] *= height sample['image'] = im sample['gt_bbox'] = crop_bbox sample['gt_class'] = crop_class if 'gt_score' in sample: sample['gt_score'] = crop_score if 'gt_keypoint' in sample.keys(): sample['gt_keypoint'] = gt_keypoints[0] sample['keypoint_ignore'] = gt_keypoints[1] return sample return sample @register_op class RandomCrop(BaseOperator): """Random crop image and bboxes. Args: aspect_ratio (list): aspect ratio of cropped region. in [min, max] format. thresholds (list): iou thresholds for decide a valid bbox crop. scaling (list): ratio between a cropped region and the original image. 
in [min, max] format. num_attempts (int): number of tries before giving up. allow_no_crop (bool): allow return without actually cropping them. cover_all_box (bool): ensure all bboxes are covered in the final crop. is_mask_crop(bool): whether crop the segmentation. """ def __init__(self, aspect_ratio=[.5, 2.], thresholds=[.0, .1, .3, .5, .7, .9], scaling=[.3, 1.], num_attempts=50, allow_no_crop=True, cover_all_box=False, is_mask_crop=False, ioumode="iou", prob=1.0): super(RandomCrop, self).__init__() self.aspect_ratio = aspect_ratio self.thresholds = thresholds self.scaling = scaling self.num_attempts = num_attempts self.allow_no_crop = allow_no_crop self.cover_all_box = cover_all_box self.is_mask_crop = is_mask_crop self.ioumode = ioumode self.prob = prob def crop_segms(self, segms, valid_ids, crop, height, width): def _crop_poly(segm, crop): xmin, ymin, xmax, ymax = crop crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] crop_p = np.array(crop_coord).reshape(4, 2) crop_p = Polygon(crop_p) crop_segm = list() for poly in segm: poly = np.array(poly).reshape(len(poly) // 2, 2) polygon = Polygon(poly) if not polygon.is_valid: exterior = polygon.exterior multi_lines = exterior.intersection(exterior) polygons = shapely.ops.polygonize(multi_lines) polygon = MultiPolygon(polygons) multi_polygon = list() if isinstance(polygon, MultiPolygon): multi_polygon = copy.deepcopy(polygon) else: multi_polygon.append(copy.deepcopy(polygon)) for per_polygon in multi_polygon: inter = per_polygon.intersection(crop_p) if not inter: continue if isinstance(inter, (MultiPolygon, GeometryCollection)): for part in inter: if not isinstance(part, Polygon): continue part = np.squeeze( np.array(part.exterior.coords[:-1]).reshape(1, -1)) part[0::2] -= xmin part[1::2] -= ymin crop_segm.append(part.tolist()) elif isinstance(inter, Polygon): crop_poly = np.squeeze( np.array(inter.exterior.coords[:-1]).reshape(1, -1)) crop_poly[0::2] -= xmin crop_poly[1::2] -= ymin crop_segm.append(crop_poly.tolist()) else: continue return crop_segm def _crop_rle(rle, crop, height, width): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) mask = mask[crop[1]:crop[3], crop[0]:crop[2]] rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle crop_segms = [] for id in valid_ids: segm = self.polygon_to_rle(segms[id], height, width) if is_poly(segm): import copy import shapely.ops from shapely.geometry import Polygon, MultiPolygon, GeometryCollection logging.getLogger("shapely").setLevel(logging.WARNING) # Polygon format crop_segms.append(_crop_poly(segm, crop)) else: # RLE format import pycocotools.mask as mask_util res = _crop_rle(segm, crop, height, width) crop_segms.append(self.rle_to_polygon(res)) return crop_segms def polygon_to_rle(self, polygons, height, width): # Create an empty mask mask_img = np.zeros((height, width), dtype=np.uint8) # Fill the polygon in the mask for polygon in polygons: contour = np.array(polygon).reshape((-1, 1, 2)).astype(int) cv2.drawContours(mask_img, [contour], 0, 255, -1) # Convert binary mask to RLE rle = mask.encode(np.asfortranarray(mask_img)) return rle def rle_to_polygon(self, rle_mask, min_area=5): binary_mask = mask.decode(rle_mask).squeeze() # Find contours in the binary mask contours, _ = cv2.findContours( binary_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) polygons = [] for contour in contours: # Convert contour to polygon and filter small areas if cv2.contourArea(contour) 
>= min_area: # Flatten list and add to polygons polygon = contour.flatten().tolist() if len(polygon) > 4: polygons.append(polygon) return polygons def set_fake_bboxes(self, sample): sample['gt_bbox'] = np.array( [ [32, 32, 128, 128], [32, 32, 128, 256], [32, 64, 128, 128], [32, 64, 128, 256], [64, 64, 128, 256], [64, 64, 256, 256], [64, 32, 128, 256], [64, 32, 128, 256], [96, 32, 128, 256], [96, 32, 128, 256], ], dtype=np.float32) sample['gt_class'] = np.array( [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32) return sample def apply(self, sample, context=None): if random.random() > self.prob: return sample if 'gt_bbox' not in sample: # only used in semi-det as unsup data sample = self.set_fake_bboxes(sample) sample = self.random_crop(sample, fake_bboxes=True) del sample['gt_bbox'] del sample['gt_class'] return sample if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: return sample sample = self.random_crop(sample) return sample def random_crop(self, sample, fake_bboxes=False): h, w = sample['image'].shape[:2] gt_bbox = sample['gt_bbox'] # NOTE Original method attempts to generate one candidate for each # threshold then randomly sample one from the resulting list. # Here a short circuit approach is taken, i.e., randomly choose a # threshold and attempt to find a valid crop, and simply return the # first one found. # The probability is not exactly the same, kinda resembling the # "Monty Hall" problem. Actually carrying out the attempts will affect # observability (just like opening doors in the "Monty Hall" game). thresholds = list(self.thresholds) if self.allow_no_crop: thresholds.append('no_crop') np.random.shuffle(thresholds) for thresh in thresholds: if thresh == 'no_crop': return sample found = False for i in range(self.num_attempts): scale = np.random.uniform(*self.scaling) if self.aspect_ratio is not None: min_ar, max_ar = self.aspect_ratio aspect_ratio = np.random.uniform( max(min_ar, scale**2), min(max_ar, scale**-2)) h_scale = scale / np.sqrt(aspect_ratio) w_scale = scale * np.sqrt(aspect_ratio) else: h_scale = np.random.uniform(*self.scaling) w_scale = np.random.uniform(*self.scaling) crop_h = h * h_scale crop_w = w * w_scale if self.aspect_ratio is None: if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: continue crop_h = int(crop_h) crop_w = int(crop_w) crop_y = np.random.randint(0, h - crop_h) crop_x = np.random.randint(0, w - crop_w) crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] if self.ioumode == "iof": iou = self._gtcropiou_matrix( gt_bbox, np.array( [crop_box], dtype=np.float32)) elif self.ioumode == "iou": iou = self._iou_matrix( gt_bbox, np.array( [crop_box], dtype=np.float32)) if iou.max() < thresh: continue if self.cover_all_box and iou.min() < thresh: continue cropped_box, valid_ids = self._crop_box_with_center_constraint( gt_bbox, np.array( crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break if found: if self.is_mask_crop and 'gt_poly' in sample and len(sample[ 'gt_poly']) > 0: crop_polys = self.crop_segms( sample['gt_poly'], valid_ids, np.array( crop_box, dtype=np.int64), h, w) if [] in crop_polys: delete_id = list() valid_polys = list() for id, crop_poly in enumerate(crop_polys): if crop_poly == []: delete_id.append(id) else: valid_polys.append(crop_poly) valid_ids = np.delete(valid_ids, delete_id) if len(valid_polys) == 0: return sample sample['gt_poly'] = valid_polys else: sample['gt_poly'] = crop_polys if 'gt_segm' in sample: sample['gt_segm'] = self._crop_segm(sample['gt_segm'], crop_box) sample['gt_segm'] = 
np.take( sample['gt_segm'], valid_ids, axis=0) sample['image'] = self._crop_image(sample['image'], crop_box) if fake_bboxes == True: return sample sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) sample['gt_class'] = np.take( sample['gt_class'], valid_ids, axis=0) if 'gt_score' in sample: sample['gt_score'] = np.take( sample['gt_score'], valid_ids, axis=0) if 'is_crowd' in sample: sample['is_crowd'] = np.take( sample['is_crowd'], valid_ids, axis=0) if 'difficult' in sample: sample['difficult'] = np.take( sample['difficult'], valid_ids, axis=0) if 'gt_joints' in sample: sample['gt_joints'] = self._crop_joints(sample['gt_joints'], crop_box) return sample return sample def _iou_matrix(self, a, b): tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) area_o = (area_a[:, np.newaxis] + area_b - area_i) return area_i / (area_o + 1e-10) def _gtcropiou_matrix(self, a, b): tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) area_o = (area_a[:, np.newaxis] + area_b - area_i) return area_i / (area_a + 1e-10) def _crop_box_with_center_constraint(self, box, crop): cropped_box = box.copy() cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) cropped_box[:, :2] -= crop[:2] cropped_box[:, 2:] -= crop[:2] centers = (box[:, :2] + box[:, 2:]) / 2 valid = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1) valid = np.logical_and( valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) return cropped_box, np.where(valid)[0] def _crop_image(self, img, crop): x1, y1, x2, y2 = crop return img[y1:y2, x1:x2, :] def _crop_segm(self, segm, crop): x1, y1, x2, y2 = crop return segm[:, y1:y2, x1:x2] def _crop_joints(self, joints, crop): x1, y1, x2, y2 = crop joints[joints[..., 0] > x2, :] = 0 joints[joints[..., 1] > y2, :] = 0 joints[joints[..., 0] < x1, :] = 0 joints[joints[..., 1] < y1, :] = 0 joints[..., 0] -= x1 joints[..., 1] -= y1 return joints @register_op class RandomScaledCrop(BaseOperator): """Resize image and bbox based on long side (with optional random scaling), then crop or pad image to target size. Args: target_size (int|list): target size, "hw" format. scale_range (list): random scale range. interp (int): interpolation method, default to `cv2.INTER_LINEAR`. fill_value (float|list|tuple): color value used to fill the canvas, in RGB order. 
""" def __init__(self, target_size=512, scale_range=[.1, 2.], interp=cv2.INTER_LINEAR, fill_value=(123.675, 116.28, 103.53)): super(RandomScaledCrop, self).__init__() assert isinstance(target_size, ( Integral, Sequence)), "target_size must be Integer, List or Tuple" if isinstance(target_size, Integral): target_size = [target_size, ] * 2 self.target_size = target_size self.scale_range = scale_range self.interp = interp assert isinstance(fill_value, (Number, Sequence)), \ "fill value must be either float or sequence" if isinstance(fill_value, Number): fill_value = (fill_value, ) * 3 if not isinstance(fill_value, tuple): fill_value = tuple(fill_value) self.fill_value = fill_value def apply_image(self, img, output_size, offset_x, offset_y): th, tw = self.target_size rh, rw = output_size img = cv2.resize( img, (rw, rh), interpolation=self.interp).astype(np.float32) canvas = np.ones([th, tw, 3], dtype=np.float32) canvas *= np.array(self.fill_value, dtype=np.float32) canvas[:min(th, rh), :min(tw, rw)] = \ img[offset_y:offset_y + th, offset_x:offset_x + tw] return canvas def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y): th, tw = self.target_size shift_array = np.array( [ offset_x, offset_y, ] * 2, dtype=np.float32) boxes = gt_bbox * scale - shift_array boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw) boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th) # filter boxes with no area area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1) valid = (area > 1.).nonzero()[0] return boxes[valid], gt_class[valid], valid def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None): th, tw = self.target_size rh, rw = output_size out_segms = [] for segm in segms: segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST) segm = segm.astype(np.float32) canvas = np.zeros([th, tw], dtype=segm.dtype) canvas[:min(th, rh), :min(tw, rw)] = \ segm[offset_y:offset_y + th, offset_x:offset_x + tw] out_segms.append(canvas) out_segms = np.stack(out_segms) return out_segms if valid is None else out_segms[valid] def apply(self, sample, context=None): img = sample['image'] h, w = img.shape[:2] random_scale = np.random.uniform(*self.scale_range) target_scale_size = [t * random_scale for t in self.target_size] # Compute actual rescaling applied to image. 
scale = min(target_scale_size[0] / h, target_scale_size[1] / w) output_size = [int(round(h * scale)), int(round(w * scale))] # get offset offset_x = int( max(0, np.random.uniform(0., output_size[1] - self.target_size[1]))) offset_y = int( max(0, np.random.uniform(0., output_size[0] - self.target_size[0]))) # apply to image sample['image'] = self.apply_image(img, output_size, offset_x, offset_y) # apply to bbox valid = None if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox( sample['gt_bbox'], sample['gt_class'], scale, offset_x, offset_y) # apply to segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size, offset_x, offset_y, valid) sample['im_shape'] = np.asarray(output_size, dtype=np.float32) scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * scale, scale_factor[1] * scale], dtype=np.float32) return sample @register_op class Cutmix(BaseOperator): def __init__(self, alpha=1.5, beta=1.5): """ CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899 Cutmix image and gt_bbox/gt_score Args: alpha (float): alpha parameter of the Beta distribution beta (float): beta parameter of the Beta distribution """ super(Cutmix, self).__init__() self.alpha = alpha self.beta = beta if self.alpha <= 0.0: raise ValueError("alpha should be positive in {}".format(self)) if self.beta <= 0.0: raise ValueError("beta should be positive in {}".format(self)) def apply_image(self, img1, img2, factor): """ _rand_bbox: cut a random box from img2 and paste it onto img1 """ h = max(img1.shape[0], img2.shape[0]) w = max(img1.shape[1], img2.shape[1]) cut_rat = np.sqrt(1. - factor) cut_w = np.int32(w * cut_rat) cut_h = np.int32(h * cut_rat) # uniform cx = np.random.randint(w) cy = np.random.randint(h) bbx1 = np.clip(cx - cut_w // 2, 0, w - 1) bby1 = np.clip(cy - cut_h // 2, 0, h - 1) bbx2 = np.clip(cx + cut_w // 2, 0, w - 1) bby2 = np.clip(cy + cut_h // 2, 0, h - 1) img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32') img_1_pad[:img1.shape[0], :img1.shape[1], :] = \ img1.astype('float32') img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32') img_2_pad[:img2.shape[0], :img2.shape[1], :] = \ img2.astype('float32') img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :] return img_1_pad def __call__(self, sample, context=None): if not isinstance(sample, Sequence): return sample assert len(sample) == 2, 'cutmix needs two samples' factor = np.random.beta(self.alpha, self.beta) factor = max(0.0, min(1.0, factor)) if factor >= 1.0: return sample[0] if factor <= 0.0: return sample[1] img1 = sample[0]['image'] img2 = sample[1]['image'] img = self.apply_image(img1, img2, factor) gt_bbox1 = sample[0]['gt_bbox'] gt_bbox2 = sample[1]['gt_bbox'] gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) gt_class1 = sample[0]['gt_class'] gt_class2 = sample[1]['gt_class'] gt_class = np.concatenate((gt_class1, gt_class2), axis=0) gt_score1 = np.ones_like(sample[0]['gt_class']) gt_score2 = np.ones_like(sample[1]['gt_class']) gt_score = np.concatenate( (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0) result = copy.deepcopy(sample[0]) result['image'] = img result['gt_bbox'] = gt_bbox result['gt_score'] = gt_score result['gt_class'] = gt_class if 'is_crowd' in sample[0]: is_crowd1 = sample[0]['is_crowd'] is_crowd2 = sample[1]['is_crowd'] is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) result['is_crowd'] = is_crowd if 'difficult' in sample[0]: is_difficult1 = sample[0]['difficult'] is_difficult2 = sample[1]['difficult'] is_difficult = np.concatenate( (is_difficult1, is_difficult2), axis=0) result['difficult'] = is_difficult return result @register_op class Mixup(BaseOperator): def __init__(self, alpha=1.5, beta=1.5): """ Mixup image and gt_bbox/gt_score Args: alpha (float): alpha parameter of the Beta distribution beta (float): beta parameter of the Beta distribution """ super(Mixup, self).__init__() self.alpha = alpha self.beta = beta if self.alpha <= 0.0: raise ValueError("alpha should be positive in {}".format(self)) if self.beta <= 0.0: raise ValueError("beta should be positive in {}".format(self)) def apply_image(self, img1, img2, factor): h = max(img1.shape[0], img2.shape[0]) w = max(img1.shape[1], img2.shape[1]) img = np.zeros((h, w, img1.shape[2]), 'float32') img[:img1.shape[0], :img1.shape[1], :] = \ img1.astype('float32') * factor img[:img2.shape[0], :img2.shape[1], :] += \ img2.astype('float32') * (1.0 - factor) return img.astype('uint8') def __call__(self, sample, context=None): if not isinstance(sample, Sequence): return sample assert len(sample) == 2, 'mixup needs two samples' factor = np.random.beta(self.alpha, self.beta) factor = max(0.0, min(1.0, factor)) if factor >= 1.0: return sample[0] if factor <= 0.0: return sample[1] im = self.apply_image(sample[0]['image'], sample[1]['image'], factor) result = copy.deepcopy(sample[0]) result['image'] = im # apply bbox and score if 'gt_bbox' in sample[0]: gt_bbox1 = sample[0]['gt_bbox'] gt_bbox2 = sample[1]['gt_bbox'] gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) result['gt_bbox'] = gt_bbox if 'gt_class' in sample[0]: gt_class1 = sample[0]['gt_class'] gt_class2 = sample[1]['gt_class'] gt_class = np.concatenate((gt_class1, gt_class2), axis=0) result['gt_class'] = gt_class gt_score1 = np.ones_like(sample[0]['gt_class']) gt_score2 = np.ones_like(sample[1]['gt_class']) gt_score = np.concatenate( (gt_score1 * factor, gt_score2 * (1.
- factor)), axis=0) result['gt_score'] = gt_score.astype('float32') if 'is_crowd' in sample[0]: is_crowd1 = sample[0]['is_crowd'] is_crowd2 = sample[1]['is_crowd'] is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) result['is_crowd'] = is_crowd if 'difficult' in sample[0]: is_difficult1 = sample[0]['difficult'] is_difficult2 = sample[1]['difficult'] is_difficult = np.concatenate( (is_difficult1, is_difficult2), axis=0) result['difficult'] = is_difficult if 'gt_ide' in sample[0]: gt_ide1 = sample[0]['gt_ide'] gt_ide2 = sample[1]['gt_ide'] gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0) result['gt_ide'] = gt_ide return result @register_op class NormalizeBox(BaseOperator): """Transform the bounding box's coordinates to [0,1].""" def __init__(self, retain_origin_box=False): super(NormalizeBox, self).__init__() self.retain_origin_box = retain_origin_box def apply(self, sample, context): im = sample['image'] if 'gt_bbox' in sample.keys(): if self.retain_origin_box: sample['origin_gt_bbox'] = sample['gt_bbox'].copy() sample['origin_gt_class'] = sample['gt_class'].copy() gt_bbox = sample['gt_bbox'] height, width, _ = im.shape for i in range(gt_bbox.shape[0]): gt_bbox[i][0] = gt_bbox[i][0] / width gt_bbox[i][1] = gt_bbox[i][1] / height gt_bbox[i][2] = gt_bbox[i][2] / width gt_bbox[i][3] = gt_bbox[i][3] / height sample['gt_bbox'] = gt_bbox if 'gt_keypoint' in sample.keys(): gt_keypoint = sample['gt_keypoint'] for i in range(gt_keypoint.shape[1]): if i % 2: gt_keypoint[:, i] = gt_keypoint[:, i] / height else: gt_keypoint[:, i] = gt_keypoint[:, i] / width sample['gt_keypoint'] = gt_keypoint return sample else: return sample @register_op class BboxXYXY2XYWH(BaseOperator): """ Convert bbox XYXY format to XYWH format. [x0, y0, x1, y1] -> [center_x, center_y, width, height] """ def __init__(self): super(BboxXYXY2XYWH, self).__init__() def apply(self, sample, context=None): if 'gt_bbox' in sample.keys(): bbox = sample['gt_bbox'] bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2] bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2. sample['gt_bbox'] = bbox return sample else: return sample @register_op class PadBox(BaseOperator): def __init__(self, num_max_boxes=50): """ Pad zeros to bboxes if number of bboxes is less than num_max_boxes. Args: num_max_boxes (int): the max number of bboxes """ self.num_max_boxes = num_max_boxes super(PadBox, self).__init__() def apply(self, sample, context=None): assert 'gt_bbox' in sample bbox = sample['gt_bbox'] gt_num = min(self.num_max_boxes, len(bbox)) num_max = self.num_max_boxes # fields = context['fields'] if context else [] pad_bbox = np.zeros((num_max, 4), dtype=np.float32) if gt_num > 0: pad_bbox[:gt_num, :] = bbox[:gt_num, :] sample['gt_bbox'] = pad_bbox if 'gt_class' in sample: pad_class = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_class[:gt_num] = sample['gt_class'][:gt_num, 0] sample['gt_class'] = pad_class if 'gt_score' in sample: pad_score = np.zeros((num_max, ), dtype=np.float32) if gt_num > 0: pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] sample['gt_score'] = pad_score # in training, for example in op ExpandImage, # the bbox and gt_class are expanded, but the difficult is not, # so judge by its length if 'difficult' in sample: pad_diff = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] sample['difficult'] = pad_diff if 'is_crowd' in sample: pad_crowd = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] sample['is_crowd'] = pad_crowd if 'gt_ide' in sample: pad_ide = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] sample['gt_ide'] = pad_ide return sample @register_op class DebugVisibleImage(BaseOperator): """ In debug mode, visualize images according to `gt_bbox`. (Currently only supported when the image is not cropped or flipped.) """ def __init__(self, output_dir='output/debug', is_normalized=False): super(DebugVisibleImage, self).__init__() self.is_normalized = is_normalized self.output_dir = output_dir if not os.path.isdir(output_dir): os.makedirs(output_dir) if not isinstance(self.is_normalized, bool): raise TypeError("{}: input type is invalid.".format(self)) def apply(self, sample, context=None): image = Image.fromarray(sample['image'].astype(np.uint8)) out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) width = sample['w'] height = sample['h'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] draw = ImageDraw.Draw(image) for i in range(gt_bbox.shape[0]): if self.is_normalized: gt_bbox[i][0] = gt_bbox[i][0] * width gt_bbox[i][1] = gt_bbox[i][1] * height gt_bbox[i][2] = gt_bbox[i][2] * width gt_bbox[i][3] = gt_bbox[i][3] * height xmin, ymin, xmax, ymax = gt_bbox[i] draw.line( [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], width=2, fill='green') # draw label text = str(gt_class[i][0]) tw, th = imagedraw_textsize_c(draw, text) draw.rectangle( [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) if 'gt_keypoint' in sample.keys(): gt_keypoint = sample['gt_keypoint'] if self.is_normalized: for i in range(gt_keypoint.shape[1]): if i % 2: gt_keypoint[:, i] = gt_keypoint[:, i] * height else: gt_keypoint[:, i] = gt_keypoint[:, i] * width for i in range(gt_keypoint.shape[0]): keypoint = gt_keypoint[i] for j in range(int(keypoint.shape[0] / 2)): x1 = round(keypoint[2 * j]).astype(np.int32) y1 = round(keypoint[2 * j + 1]).astype(np.int32) draw.ellipse( (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') save_path = os.path.join(self.output_dir, out_file_name) image.save(save_path, quality=95) return sample @register_op class Pad(BaseOperator): def __init__(self, size=None, size_divisor=32,
pad_mode=0, offsets=None, fill_value=(127.5, 127.5, 127.5)): """ Pad image to a specified size or multiple of size_divisor. Args: size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None size_divisor (int): size divisor, default 32 pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1 fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5) """ super(Pad, self).__init__() if not isinstance(size, (int, Sequence)): raise TypeError( "Type of target_size is invalid when random_size is True. \ Must be List, now is {}".format(type(size))) if isinstance(size, int): size = [size, size] assert pad_mode in [ -1, 0, 1, 2 ], 'currently only supports four modes [-1, 0, 1, 2]' if pad_mode == -1: assert offsets, 'if pad_mode is -1, offsets should not be None' self.size = size self.size_divisor = size_divisor self.pad_mode = pad_mode self.fill_value = fill_value self.offsets = offsets def apply_segm(self, segms, offsets, im_size, size): def _expand_poly(poly, x, y): expanded_poly = np.array(poly) expanded_poly[0::2] += x expanded_poly[1::2] += y return expanded_poly.tolist() def _expand_rle(rle, x, y, height, width, h, w): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) expanded_mask = np.full((h, w), 0).astype(mask.dtype) expanded_mask[y:y + height, x:x + width] = mask rle = mask_util.encode( np.array( expanded_mask, order='F', dtype=np.uint8)) return rle x, y = offsets height, width = im_size h, w = size expanded_segms = [] for segm in segms: if is_poly(segm): # Polygon format expanded_segms.append( [_expand_poly(poly, x, y) for poly in segm]) else: # RLE format import pycocotools.mask as mask_util expanded_segms.append( _expand_rle(segm, x, y, height, width, h, w)) return expanded_segms def apply_bbox(self, bbox, offsets): return bbox + np.array(offsets * 2, dtype=np.float32) def apply_keypoint(self, keypoints, offsets): n = len(keypoints[0]) // 2 return keypoints + np.array(offsets * n, dtype=np.float32) def apply_image(self, image, offsets, im_size, size): x, y = offsets im_h, im_w = im_size h, w = size canvas = np.ones((h, w, 3), dtype=np.float32) canvas *= np.array(self.fill_value, dtype=np.float32) canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32) return canvas def apply(self, sample, context=None): im = sample['image'] im_h, im_w = im.shape[:2] if self.size: h, w = self.size assert ( im_h <= h and im_w <= w ), '(h, w) of target size should be greater than (im_h, im_w)' else: h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor) w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor) if h == im_h and w == im_w: sample['image'] = im.astype(np.float32) return sample if self.pad_mode == -1: offset_x, offset_y = self.offsets elif self.pad_mode == 0: offset_y, offset_x = 0, 0 elif self.pad_mode == 1: offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 else: offset_y, offset_x = h - im_h, w - im_w offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] sample['image'] = self.apply_image(im, offsets, im_size, size) if self.pad_mode == 0: return sample if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets) if 'gt_poly' in sample and 
len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets, im_size, size) if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'], offsets) if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.copyMakeBorder( gt_segm, offset_y, h - (offset_y + im_h), offset_x, w - (offset_x + im_w), borderType=cv2.BORDER_CONSTANT, value=0) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks, dtype=np.uint8) return sample @register_op class Poly2Mask(BaseOperator): """ gt poly to mask annotations. Args: del_poly (bool): Whether to delete poly after generating mask. Default: False. """ def __init__(self, del_poly=False): super(Poly2Mask, self).__init__() import pycocotools.mask as maskUtils self.maskutils = maskUtils self.del_poly = del_poly def _poly2mask(self, mask_ann, img_h, img_w): if isinstance(mask_ann, list): # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) rle = self.maskutils.merge(rles) elif isinstance(mask_ann['counts'], list): # uncompressed RLE rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) else: # rle rle = mask_ann mask = self.maskutils.decode(rle) return mask def apply(self, sample, context=None): assert 'gt_poly' in sample im_h, im_w = sample['im_shape'] masks = [ self._poly2mask(gt_poly, im_h, im_w) for gt_poly in sample['gt_poly'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if self.del_poly: del (sample['gt_poly']) return sample @register_op class AugmentHSV(BaseOperator): """ Augment the SV channel of image data. Args: fraction (float): the fraction for augment. Default: 0.5. is_bgr (bool): whether the image is BGR mode. Default: True. hgain (float): H channel gains sgain (float): S channel gains vgain (float): V channel gains """ def __init__(self, fraction=0.50, is_bgr=True, hgain=None, sgain=None, vgain=None): super(AugmentHSV, self).__init__() self.fraction = fraction self.is_bgr = is_bgr self.hgain = hgain self.sgain = sgain self.vgain = vgain self.use_hsvgain = False if hgain is None else True def apply(self, sample, context=None): img = sample['image'] if self.is_bgr: img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) else: img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) if self.use_hsvgain: hsv_augs = np.random.uniform( -1, 1, 3) * [self.hgain, self.sgain, self.vgain] # random selection of h, s, v hsv_augs *= np.random.randint(0, 2, 3) img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) else: S = img_hsv[:, :, 1].astype(np.float32) V = img_hsv[:, :, 2].astype(np.float32) a = (random.random() * 2 - 1) * self.fraction + 1 S *= a if a > 1: np.clip(S, a_min=0, a_max=255, out=S) a = (random.random() * 2 - 1) * self.fraction + 1 V *= a if a > 1: np.clip(V, a_min=0, a_max=255, out=V) img_hsv[:, :, 1] = S.astype(np.uint8) img_hsv[:, :, 2] = V.astype(np.uint8) if self.is_bgr: cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) else: cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img) sample['image'] = img.astype(np.float32) return sample @register_op class Norm2PixelBbox(BaseOperator): """ Transform the bounding box's coornidates which is in [0,1] to pixels. 
""" def __init__(self): super(Norm2PixelBbox, self).__init__() def apply(self, sample, context=None): assert 'gt_bbox' in sample bbox = sample['gt_bbox'] height, width = sample['image'].shape[:2] bbox[:, 0::2] = bbox[:, 0::2] * width bbox[:, 1::2] = bbox[:, 1::2] * height sample['gt_bbox'] = bbox return sample @register_op class BboxCXCYWH2XYXY(BaseOperator): """ Convert bbox CXCYWH format to XYXY format. [center_x, center_y, width, height] -> [x0, y0, x1, y1] """ def __init__(self): super(BboxCXCYWH2XYXY, self).__init__() def apply(self, sample, context=None): assert 'gt_bbox' in sample bbox0 = sample['gt_bbox'] bbox = bbox0.copy() bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2. bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2. sample['gt_bbox'] = bbox return sample @register_op class RandomResizeCrop(BaseOperator): """Random resize and crop image and bboxes. Args: resizes (list): resize image to one of resizes. if keep_ratio is True and mode is 'long', resize the image's long side to the maximum of target_size, if keep_ratio is True and mode is 'short', resize the image's short side to the minimum of target_size. cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...] mode (str): resize mode, `long` or `short`. Details see resizes. prob (float): probability of this op. keep_ratio (bool): whether keep_ratio or not, default true interp (int): the interpolation method thresholds (list): iou thresholds for decide a valid bbox crop. num_attempts (int): number of tries before giving up. allow_no_crop (bool): allow return without actually cropping them. cover_all_box (bool): ensure all bboxes are covered in the final crop. is_mask_crop(bool): whether crop the segmentation. """ def __init__(self, resizes, cropsizes, prob=0.5, mode='short', keep_ratio=True, interp=cv2.INTER_LINEAR, num_attempts=3, cover_all_box=False, allow_no_crop=False, thresholds=[0.3, 0.5, 0.7], is_mask_crop=False, ioumode="iou"): super(RandomResizeCrop, self).__init__() self.resizes = resizes self.cropsizes = cropsizes self.prob = prob self.mode = mode self.ioumode = ioumode self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp) self.croper = RandomCrop( num_attempts=num_attempts, cover_all_box=cover_all_box, thresholds=thresholds, allow_no_crop=allow_no_crop, is_mask_crop=is_mask_crop) def _format_size(self, size): if isinstance(size, Integral): size = (size, size) return size def apply(self, sample, context=None): if random.random() < self.prob: _resize = self._format_size(random.choice(self.resizes)) _cropsize = self._format_size(random.choice(self.cropsizes)) sample = self._resize( self.resizer, sample, size=_resize, mode=self.mode, context=context) sample = self._random_crop( self.croper, sample, size=_cropsize, context=context) return sample @staticmethod def _random_crop(croper, sample, size, context=None): if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: return sample self = croper h, w = sample['image'].shape[:2] gt_bbox = sample['gt_bbox'] cropsize = size min_crop = min(cropsize) max_crop = max(cropsize) thresholds = list(self.thresholds) np.random.shuffle(thresholds) for thresh in thresholds: found = False for _ in range(self.num_attempts): crop_h = random.randint(min_crop, min(h, max_crop)) crop_w = random.randint(min_crop, min(w, max_crop)) crop_y = random.randint(0, h - crop_h) crop_x = random.randint(0, w - crop_w) crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] if self.ioumode == "iof": iou = self._gtcropiou_matrix( gt_bbox, np.array( [crop_box], 
dtype=np.float32)) elif self.ioumode == "iou": iou = self._iou_matrix( gt_bbox, np.array( [crop_box], dtype=np.float32)) if iou.max() < thresh: continue if self.cover_all_box and iou.min() < thresh: continue cropped_box, valid_ids = self._crop_box_with_center_constraint( gt_bbox, np.array( crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break if found: if self.is_mask_crop and 'gt_poly' in sample and len(sample[ 'gt_poly']) > 0: crop_polys = self.crop_segms( sample['gt_poly'], valid_ids, np.array( crop_box, dtype=np.int64), h, w) if [] in crop_polys: delete_id = list() valid_polys = list() for id, crop_poly in enumerate(crop_polys): if crop_poly == []: delete_id.append(id) else: valid_polys.append(crop_poly) valid_ids = np.delete(valid_ids, delete_id) if len(valid_polys) == 0: return sample sample['gt_poly'] = valid_polys else: sample['gt_poly'] = crop_polys if 'gt_segm' in sample: sample['gt_segm'] = self._crop_segm(sample['gt_segm'], crop_box) sample['gt_segm'] = np.take( sample['gt_segm'], valid_ids, axis=0) sample['image'] = self._crop_image(sample['image'], crop_box) sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) sample['gt_class'] = np.take( sample['gt_class'], valid_ids, axis=0) if 'gt_score' in sample: sample['gt_score'] = np.take( sample['gt_score'], valid_ids, axis=0) if 'is_crowd' in sample: sample['is_crowd'] = np.take( sample['is_crowd'], valid_ids, axis=0) if 'gt_areas' in sample: sample['gt_areas'] = np.take( sample['gt_areas'], valid_ids, axis=0) if 'gt_joints' in sample: gt_joints = self._crop_joints(sample['gt_joints'], crop_box) sample['gt_joints'] = gt_joints[valid_ids] return sample return sample @staticmethod def _resize(resizer, sample, size, mode='short', context=None): self = resizer im = sample['image'] target_size = size if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: raise ImageError('{}: image is not 3-dimensional.'.format(self)) # apply image im_shape = im.shape if self.keep_ratio: im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) target_size_min = np.min(target_size) target_size_max = np.max(target_size) if mode == 'long': im_scale = min(target_size_min / im_size_min, target_size_max / im_size_max) else: im_scale = max(target_size_min / im_size_min, target_size_max / im_size_max) resize_h = int(im_scale * float(im_shape[0]) + 0.5) resize_w = int(im_scale * float(im_shape[1]) + 0.5) im_scale_x = im_scale im_scale_y = im_scale else: resize_h, resize_w = target_size im_scale_y = resize_h / im_shape[0] im_scale_x = resize_w / im_shape[1] im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) sample['image'] = im sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], [im_scale_x, im_scale_y], [resize_w, resize_h]) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y]) # apply semantic if 'semantic' in sample and sample['semantic']: semantic = sample['semantic'] semantic = cv2.resize( semantic.astype('float32'), None, 
None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) semantic = np.asarray(semantic).astype('int32') semantic = np.expand_dims(semantic, 0) sample['semantic'] = semantic # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.resize( gt_segm, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_NEAREST) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if 'gt_joints' in sample: sample['gt_joints'] = self.apply_joints(sample['gt_joints'], [im_scale_x, im_scale_y], [resize_w, resize_h]) return sample @register_op class RandomSelect(BaseOperator): """ Randomly choose a transformation between transforms1 and transforms2, and the probability of choosing transforms1 is p. The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py """ def __init__(self, transforms1, transforms2, p=0.5): super(RandomSelect, self).__init__() self.transforms1 = Compose(transforms1) self.transforms2 = Compose(transforms2) self.p = p def apply(self, sample, context=None): if random.random() < self.p: return self.transforms1(sample) return self.transforms2(sample) @register_op class RandomSelects(BaseOperator): """ Randomly choose a transformation between transforms1 and transforms2, and the probability of choosing transforms1 is p. The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py """ def __init__(self, transforms_list, p=None): super(RandomSelects, self).__init__() if p is not None: assert isinstance(p, (list, tuple)) assert len(transforms_list) == len(p) else: assert len(transforms_list) > 0 self.transforms = [Compose(t) for t in transforms_list] self.p = p def apply(self, sample, context=None): if self.p is None: return random.choice(self.transforms)(sample) else: prob = random.random() for p, t in zip(self.p, self.transforms): if prob <= p: return t(sample) @register_op class RandomShortSideResize(BaseOperator): def __init__(self, short_side_sizes, max_size=None, interp=cv2.INTER_LINEAR, random_interp=False): """ Resize the image randomly according to the short side. If max_size is not None, the long side is scaled according to max_size. The whole process will be keep ratio. Args: short_side_sizes (list|tuple): Image target short side size. max_size (int): The size of the longest side of image after resize. interp (int): The interpolation method. random_interp (bool): Whether random select interpolation method. 
""" super(RandomShortSideResize, self).__init__() assert isinstance(short_side_sizes, Sequence), "short_side_sizes must be List or Tuple" self.short_side_sizes = short_side_sizes self.max_size = max_size self.interp = interp self.random_interp = random_interp self.interps = [ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4, ] def get_size_with_aspect_ratio(self, image_shape, size, max_size=None): h, w = image_shape max_clip = False if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(max_size * min_original_size / max_original_size) max_clip = True if (w <= h and w == size) or (h <= w and h == size): return (w, h) if w < h: ow = size oh = int(round(size * h / w)) if not max_clip else max_size else: oh = size ow = int(round(size * w / h)) if not max_clip else max_size return (ow, oh) def resize(self, sample, target_size, max_size=None, interp=cv2.INTER_LINEAR): im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: raise ImageError('{}: image is not 3-dimensional.'.format(self)) target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size, max_size) im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[ 0] / im.shape[1] sample['image'] = cv2.resize(im, target_size, interpolation=interp) sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox( sample['gt_bbox'], [im_scale_x, im_scale_y], target_size) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2], [im_scale_x, im_scale_y]) # apply semantic if 'semantic' in sample and sample['semantic']: semantic = sample['semantic'] semantic = cv2.resize( semantic.astype('float32'), target_size, interpolation=self.interp) semantic = np.asarray(semantic).astype('int32') semantic = np.expand_dims(semantic, 0) sample['semantic'] = semantic # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.resize( gt_segm, target_size, interpolation=cv2.INTER_NEAREST) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if 'gt_joints' in sample: sample['gt_joints'] = self.apply_joints( sample['gt_joints'], [im_scale_x, im_scale_y], target_size) # apply areas if 'gt_areas' in sample: sample['gt_areas'] = self.apply_area(sample['gt_areas'], [im_scale_x, im_scale_y]) return sample def apply_bbox(self, bbox, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size bbox[:, 0::2] *= im_scale_x bbox[:, 1::2] *= im_scale_y bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) return bbox.astype('float32') def apply_joints(self, joints, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size joints[..., 0] *= im_scale_x joints[..., 1] *= im_scale_y # joints[joints[..., 0] >= resize_w, :] = 0 # joints[joints[..., 1] >= resize_h, :] = 0 # joints[joints[..., 0] < 0, :] = 0 # joints[joints[..., 1] < 0, :] = 
0 joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) return joints def apply_area(self, area, scale): im_scale_x, im_scale_y = scale return area * im_scale_x * im_scale_y def apply_segm(self, segms, im_size, scale): def _resize_poly(poly, im_scale_x, im_scale_y): resized_poly = np.array(poly).astype('float32') resized_poly[0::2] *= im_scale_x resized_poly[1::2] *= im_scale_y return resized_poly.tolist() def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, im_h, im_w) mask = mask_util.decode(rle) mask = cv2.resize( mask, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle im_h, im_w = im_size im_scale_x, im_scale_y = scale resized_segms = [] for segm in segms: if is_poly(segm): # Polygon format resized_segms.append([ _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm ]) else: # RLE format import pycocotools.mask as mask_util resized_segms.append( _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) return resized_segms def apply(self, sample, context=None): target_size = random.choice(self.short_side_sizes) interp = random.choice( self.interps) if self.random_interp else self.interp return self.resize(sample, target_size, self.max_size, interp) @register_op class RandomShortSideRangeResize(RandomShortSideResize): def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False): """ Resize the image randomly according to the short side. If max_size is not None, the long side is scaled according to max_size. The whole process will be keep ratio. Args: short_side_sizes (list|tuple): Image target short side size. interp (int): The interpolation method. random_interp (bool): Whether random select interpolation method. """ super(RandomShortSideRangeResize, self).__init__(scales, None, interp, random_interp) assert isinstance(scales, Sequence), "short_side_sizes must be List or Tuple" self.scales = scales def random_sample(self, img_scales): img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale def apply(self, sample, context=None): long_edge, short_edge = self.random_sample(self.short_side_sizes) # print("target size:{}".format((long_edge, short_edge))) interp = random.choice( self.interps) if self.random_interp else self.interp return self.resize(sample, short_edge, long_edge, interp) @register_op class RandomSizeCrop(BaseOperator): """ Cut the image randomly according to `min_size` and `max_size` Args: min_size (int): Min size for edges of cropped image. max_size (int): Max size for edges of cropped image. If it is set to larger than length of the input image, the output will keep the origin length. keep_empty (bool): Whether to keep the cropped result with no object. If it is set to False, the no-object result will not be returned, replaced by the original input. 
""" def __init__(self, min_size, max_size, keep_empty=True): super(RandomSizeCrop, self).__init__() self.min_size = min_size self.max_size = max_size self.keep_empty = keep_empty from paddle.vision.transforms.functional import crop as paddle_crop self.paddle_crop = paddle_crop @staticmethod def get_crop_params(img_shape, output_size): """Get parameters for ``crop`` for a random crop. Args: img_shape (list|tuple): Image's height and width. output_size (list|tuple): Expected output size of the crop. Returns: tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. """ h, w = img_shape th, tw = output_size if h + 1 < th or w + 1 < tw: raise ValueError( "Required crop size {} is larger then input image size {}". format((th, tw), (h, w))) if w == tw and h == th: return 0, 0, h, w i = random.randint(0, h - th + 1) j = random.randint(0, w - tw + 1) return i, j, th, tw def crop(self, sample, region): keep_index = None # apply bbox and check whether the cropped result is valid if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: croped_bbox = self.apply_bbox(sample['gt_bbox'], region) bbox = croped_bbox.reshape([-1, 2, 2]) area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1) keep_index = np.where(area > 0)[0] if not self.keep_empty and len(keep_index) == 0: # When keep_empty is set to False, cropped with no-object will # not be used and return the origin content. return sample sample['gt_bbox'] = croped_bbox[keep_index] if len( keep_index) > 0 else np.zeros( [0, 4], dtype=np.float32) sample['gt_class'] = sample['gt_class'][keep_index] if len( keep_index) > 0 else np.zeros( [0, 1], dtype=np.float32) if 'gt_score' in sample: sample['gt_score'] = sample['gt_score'][keep_index] if len( keep_index) > 0 else np.zeros( [0, 1], dtype=np.float32) if 'is_crowd' in sample: sample['is_crowd'] = sample['is_crowd'][keep_index] if len( keep_index) > 0 else np.zeros( [0, 1], dtype=np.float32) if 'gt_areas' in sample: sample['gt_areas'] = np.take( sample['gt_areas'], keep_index, axis=0) image_shape = sample['image'].shape[:2] sample['image'] = self.paddle_crop(sample['image'], *region) sample['im_shape'] = np.array( sample['image'].shape[:2], dtype=np.float32) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region, image_shape) sample['gt_poly'] = np.array(sample['gt_poly']) if keep_index is not None and len(keep_index) > 0: sample['gt_poly'] = sample['gt_poly'][keep_index] sample['gt_poly'] = sample['gt_poly'].tolist() # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: i, j, h, w = region sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w] if keep_index is not None and len(keep_index) > 0: sample['gt_segm'] = sample['gt_segm'][keep_index] if 'gt_joints' in sample: gt_joints = self._crop_joints(sample['gt_joints'], region) sample['gt_joints'] = gt_joints if keep_index is not None: sample['gt_joints'] = sample['gt_joints'][keep_index] return sample def apply_bbox(self, bbox, region): i, j, h, w = region region_size = np.asarray([w, h]) crop_bbox = bbox - np.asarray([j, i, j, i]) crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size) crop_bbox = crop_bbox.clip(min=0) return crop_bbox.reshape([-1, 4]).astype('float32') def _crop_joints(self, joints, region): y1, x1, h, w = region x2 = x1 + w y2 = y1 + h # x1, y1, x2, y2 = crop joints[..., 0] -= x1 joints[..., 1] -= y1 joints[joints[..., 0] > w, :] = 0 joints[joints[..., 1] > h, :] = 0 joints[joints[..., 0] < 0, :] = 0 joints[joints[..., 1] 
< 0, :] = 0 return joints def apply_segm(self, segms, region, image_shape): def _crop_poly(segm, crop): xmin, ymin, xmax, ymax = crop crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] crop_p = np.array(crop_coord).reshape(4, 2) crop_p = Polygon(crop_p) crop_segm = list() for poly in segm: poly = np.array(poly).reshape(len(poly) // 2, 2) polygon = Polygon(poly) if not polygon.is_valid: exterior = polygon.exterior multi_lines = exterior.intersection(exterior) polygons = shapely.ops.polygonize(multi_lines) polygon = MultiPolygon(polygons) multi_polygon = list() if isinstance(polygon, MultiPolygon): multi_polygon = copy.deepcopy(polygon) else: multi_polygon.append(copy.deepcopy(polygon)) for per_polygon in multi_polygon: inter = per_polygon.intersection(crop_p) if not inter: continue if isinstance(inter, (MultiPolygon, GeometryCollection)): for part in inter: if not isinstance(part, Polygon): continue part = np.squeeze( np.array(part.exterior.coords[:-1]).reshape(1, -1)) part[0::2] -= xmin part[1::2] -= ymin crop_segm.append(part.tolist()) elif isinstance(inter, Polygon): crop_poly = np.squeeze( np.array(inter.exterior.coords[:-1]).reshape(1, -1)) crop_poly[0::2] -= xmin crop_poly[1::2] -= ymin crop_segm.append(crop_poly.tolist()) else: continue return crop_segm def _crop_rle(rle, crop, height, width): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) mask = mask[crop[1]:crop[3], crop[0]:crop[2]] rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle i, j, h, w = region crop = [j, i, j + w, i + h] height, width = image_shape crop_segms = [] for segm in segms: if is_poly(segm): import copy import shapely.ops from shapely.geometry import Polygon, MultiPolygon, GeometryCollection # Polygon format crop_segms.append(_crop_poly(segm, crop)) else: # RLE format import pycocotools.mask as mask_util crop_segms.append(_crop_rle(segm, crop, height, width)) return crop_segms def apply(self, sample, context=None): h = random.randint(self.min_size, min(sample['image'].shape[0], self.max_size)) w = random.randint(self.min_size, min(sample['image'].shape[1], self.max_size)) region = self.get_crop_params(sample['image'].shape[:2], [h, w]) return self.crop(sample, region) @register_op class WarpAffine(BaseOperator): def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, scale=0.4, shift=0.1, down_ratio=4): """WarpAffine Warp affine the image The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py """ super(WarpAffine, self).__init__() self.keep_res = keep_res self.pad = pad self.input_h = input_h self.input_w = input_w self.scale = scale self.shift = shift self.down_ratio = down_ratio def apply(self, sample, context=None): img = sample['image'] img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) h, w = img.shape[:2] if self.keep_res: # True in detection eval/infer input_h = (h | self.pad) + 1 input_w = (w | self.pad) + 1 s = np.array([input_w, input_h], dtype=np.float32) c = np.array([w // 2, h // 2], dtype=np.float32) else: # False in centertrack eval_mot/eval_mot s = max(h, w) * 1.0 input_h, input_w = self.input_h, self.input_w c = np.array([w / 2., h / 2.], dtype=np.float32) trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) img = cv2.resize(img, (w, h)) inp = cv2.warpAffine( img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) sample['image'] = inp if not self.keep_res: out_h = input_h // self.down_ratio out_w = input_w 
// self.down_ratio trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) sample.update({ 'center': c, 'scale': s, 'out_height': out_h, 'out_width': out_w, 'inp_height': input_h, 'inp_width': input_w, 'trans_input': trans_input, 'trans_output': trans_output, }) return sample @register_op class FlipWarpAffine(BaseOperator): def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, not_rand_crop=False, scale=0.4, shift=0.1, flip=0.5, is_scale=True, use_random=True, add_pre_img=False): """FlipWarpAffine 1. Random Crop 2. Flip the image horizontally 3. Warp affine the image 4. (Optional) Add previous image """ super(FlipWarpAffine, self).__init__() self.keep_res = keep_res self.pad = pad self.input_h = input_h self.input_w = input_w self.not_rand_crop = not_rand_crop self.scale = scale self.shift = shift self.flip = flip self.is_scale = is_scale self.use_random = use_random self.add_pre_img = add_pre_img def __call__(self, samples, context=None): if self.add_pre_img: assert isinstance(samples, Sequence) and len(samples) == 2 sample, pre_sample = samples[0], samples[1] else: sample = samples img = sample['image'] img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: return sample h, w = img.shape[:2] flipped = 0 if self.keep_res: input_h = (h | self.pad) + 1 input_w = (w | self.pad) + 1 s = np.array([input_w, input_h], dtype=np.float32) c = np.array([w // 2, h // 2], dtype=np.float32) else: # centernet training default s = max(h, w) * 1.0 input_h, input_w = self.input_h, self.input_w c = np.array([w / 2., h / 2.], dtype=np.float32) if self.use_random: gt_bbox = sample['gt_bbox'] if not self.not_rand_crop: # centernet default s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) w_border = get_border(128, w) h_border = get_border(128, h) c[0] = np.random.randint(low=w_border, high=w - w_border) c[1] = np.random.randint(low=h_border, high=h - h_border) else: sf = self.scale cf = self.shift c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) if np.random.random() < self.flip: img = img[:, ::-1, :] c[0] = w - c[0] - 1 oldx1 = gt_bbox[:, 0].copy() oldx2 = gt_bbox[:, 2].copy() gt_bbox[:, 0] = w - oldx2 - 1 gt_bbox[:, 2] = w - oldx1 - 1 flipped = 1 sample['gt_bbox'] = gt_bbox trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) inp = cv2.warpAffine( img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) if self.is_scale: inp = (inp.astype(np.float32) / 255.) sample['image'] = inp sample['center'] = c sample['scale'] = s if self.add_pre_img: sample['trans_input'] = trans_input # previous image, use same aug trans_input as current image pre_img = pre_sample['image'] pre_img = cv2.cvtColor(pre_img, cv2.COLOR_RGB2BGR) if flipped: pre_img = pre_img[:, ::-1, :].copy() pre_inp = cv2.warpAffine( pre_img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) if self.is_scale: pre_inp = (pre_inp.astype(np.float32) / 255.)
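# Note: the previous frame deliberately reuses the same trans_input (and the
# same horizontal flip) computed for the current frame, so the two warped
# images stay pixel-aligned and CenterTrack-style offsets between frames
# remain valid.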
sample['pre_image'] = pre_inp # if empty gt_bbox if 'gt_bbox' in pre_sample and len(pre_sample['gt_bbox']) == 0: return sample pre_gt_bbox = pre_sample['gt_bbox'] if flipped: pre_oldx1 = pre_gt_bbox[:, 0].copy() pre_oldx2 = pre_gt_bbox[:, 2].copy() pre_gt_bbox[:, 0] = w - pre_oldx1 - 1 pre_gt_bbox[:, 2] = w - pre_oldx2 - 1 sample['pre_gt_bbox'] = pre_gt_bbox sample['pre_gt_class'] = pre_sample['gt_class'] sample['pre_gt_track_id'] = pre_sample['gt_track_id'] del pre_sample return sample @register_op class CenterRandColor(BaseOperator): """Random color for CenterNet series models. Args: saturation (float): saturation settings. contrast (float): contrast settings. brightness (float): brightness settings. """ def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4): super(CenterRandColor, self).__init__() self.saturation = saturation self.contrast = contrast self.brightness = brightness def apply_saturation(self, img, img_gray): alpha = 1. + np.random.uniform( low=-self.saturation, high=self.saturation) self._blend(alpha, img, img_gray[:, :, None]) return img def apply_contrast(self, img, img_gray): alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast) img_mean = img_gray.mean() self._blend(alpha, img, img_mean) return img def apply_brightness(self, img, img_gray): alpha = 1 + np.random.uniform( low=-self.brightness, high=self.brightness) img *= alpha return img def _blend(self, alpha, img, img_mean): img *= alpha img_mean *= (1 - alpha) img += img_mean def apply(self, sample, context=None): functions = [ self.apply_brightness, self.apply_contrast, self.apply_saturation, ] img = sample['image'] img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) distortions = np.random.permutation(functions) for func in distortions: img = func(img, img_gray) sample['image'] = img if 'pre_image' in sample: pre_img = sample['pre_image'] pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY) pre_distortions = np.random.permutation(functions) for func in pre_distortions: pre_img = func(pre_img, pre_img_gray) sample['pre_image'] = pre_img return sample @register_op class Mosaic(BaseOperator): """ Mosaic operator for image and gt_bboxes The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py 1. get mosaic coords 2. clip bbox and get mosaic_labels 3. random_affine augment 4. 
Mixup augment as copypaste (optional), not used in tiny/nano Args: prob (float): probability of using Mosaic, 1.0 as default input_dim (list[int]): input shape degrees (list[2]): the rotate range to apply, transform range is [min, max] translate (list[2]): the translate range to apply, transform range is [min, max] scale (list[2]): the scale range to apply, transform range is [min, max] shear (list[2]): the shear range to apply, transform range is [min, max] enable_mixup (bool): whether to enable Mixup or not mixup_prob (float): probability of using Mixup, 1.0 as default mixup_scale (list[float]): scale range of Mixup remove_outside_box (bool): whether to remove outside boxes, False as default in COCO dataset, True in MOT dataset """ def __init__(self, prob=1.0, input_dim=[640, 640], degrees=[-10, 10], translate=[-0.1, 0.1], scale=[0.1, 2], shear=[-2, 2], enable_mixup=True, mixup_prob=1.0, mixup_scale=[0.5, 1.5], remove_outside_box=False): super(Mosaic, self).__init__() self.prob = prob if isinstance(input_dim, Integral): input_dim = [input_dim, input_dim] self.input_dim = input_dim self.degrees = degrees self.translate = translate self.scale = scale self.shear = shear self.enable_mixup = enable_mixup self.mixup_prob = mixup_prob self.mixup_scale = mixup_scale self.remove_outside_box = remove_outside_box def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w): # (x1, y1, x2, y2) means coords in large image, # small_coords means coords in small image in mosaic aug. if mosaic_idx == 0: # top left x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc small_coords = w - (x2 - x1), h - (y2 - y1), w, h elif mosaic_idx == 1: # top right x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h elif mosaic_idx == 2: # bottom left x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h) elif mosaic_idx == 3: # bottom right x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h) small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h) return (x1, y1, x2, y2), small_coords def random_affine_augment(self, img, labels=[], input_dim=[640, 640], degrees=[-10, 10], scales=[0.1, 2], shears=[-2, 2], translates=[-0.1, 0.1]): # random rotation and scale degree = random.uniform(degrees[0], degrees[1]) scale = random.uniform(scales[0], scales[1]) assert scale > 0, "Argument scale should be positive."
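# Sketch of the 2x3 affine M assembled below (illustrative numbers): with
# degree=0 and scale=1, R = [[1, 0, 0], [0, 1, 0]], so after mixing in the
# shear rows M = [[1, tan(shear*pi/180), 0], [tan(shear*pi/180), 1, 0]]; the
# final column is then overwritten with the random translation in pixels.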
R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) M = np.ones([2, 3]) # random shear shear = random.uniform(shears[0], shears[1]) shear_x = math.tan(shear * math.pi / 180) shear_y = math.tan(shear * math.pi / 180) M[0] = R[0] + shear_y * R[1] M[1] = R[1] + shear_x * R[0] # random translation translate = random.uniform(translates[0], translates[1]) translation_x = translate * input_dim[0] translation_y = translate * input_dim[1] M[0, 2] = translation_x M[1, 2] = translation_y # warpAffine img = cv2.warpAffine( img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) num_gts = len(labels) if num_gts > 0: # warp corner points corner_points = np.ones((4 * num_gts, 3)) corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 # apply affine transform corner_points = corner_points @M.T corner_points = corner_points.reshape(num_gts, 8) # create new boxes corner_xs = corner_points[:, 0::2] corner_ys = corner_points[:, 1::2] new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), corner_xs.max(1), corner_ys.max(1))) new_bboxes = new_bboxes.reshape(4, num_gts).T # clip boxes new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) labels[:, :4] = new_bboxes return img, labels def __call__(self, sample, context=None): if not isinstance(sample, Sequence): return sample assert len( sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." if np.random.uniform(0., 1.) > self.prob: return sample[0] mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] input_h, input_w = self.input_dim yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) # 1. get mosaic coords for mosaic_idx, sp in enumerate(sample[:4]): img = sp['image'] gt_bbox = sp['gt_bbox'] h0, w0 = img.shape[:2] scale = min(1. * input_h / h0, 1. * input_w / w0) img = cv2.resize( img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR) (h, w, c) = img.shape[:3] # suffix l means large image, while s means small image in mosaic aug. (l_x1, l_y1, l_x2, l_y2), ( s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords( mosaic_idx, xc, yc, w, h, input_h, input_w) mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] padw, padh = l_x1 - s_x1, l_y1 - s_y1 # Normalized xywh to pixel xyxy format _gt_bbox = gt_bbox.copy() if len(gt_bbox) > 0: _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh mosaic_gt_bbox.append(_gt_bbox) mosaic_gt_class.append(sp['gt_class']) if 'is_crowd' in sp: mosaic_is_crowd.append(sp['is_crowd']) if 'difficult' in sp: mosaic_difficult.append(sp['difficult']) # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd]) if len(mosaic_gt_bbox): mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0) mosaic_gt_class = np.concatenate(mosaic_gt_class, 0) if mosaic_is_crowd: mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0) mosaic_labels = np.concatenate([ mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype), mosaic_is_crowd.astype(mosaic_gt_bbox.dtype) ], 1) elif mosaic_difficult: mosaic_difficult = np.concatenate(mosaic_difficult, 0) mosaic_labels = np.concatenate([ mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype), mosaic_difficult.astype(mosaic_gt_bbox.dtype) ], 1) else: mosaic_labels = np.concatenate([ mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype) ], 1) if self.remove_outside_box: # for MOT dataset flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w flag2 = mosaic_gt_bbox[:, 2] > 0 flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h flag4 = mosaic_gt_bbox[:, 3] > 0 flag_all = flag1 * flag2 * flag3 * flag4 mosaic_labels = mosaic_labels[flag_all] else: mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0, 2 * input_w) mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0, 2 * input_h) mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0, 2 * input_w) mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0, 2 * input_h) else: mosaic_labels = np.zeros((1, 6)) # 3. random_affine augment mosaic_img, mosaic_labels = self.random_affine_augment( mosaic_img, mosaic_labels, input_dim=self.input_dim, degrees=self.degrees, translates=self.translate, scales=self.scale, shears=self.shear) # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177 # optional, not used (enable_mixup=False) in tiny/nano if (self.enable_mixup and not len(mosaic_labels) == 0 and random.random() < self.mixup_prob): sample_mixup = sample[4] mixup_img = sample_mixup['image'] if 'is_crowd' in sample_mixup: cp_labels = np.concatenate([ sample_mixup['gt_bbox'], sample_mixup['gt_class'].astype(mosaic_labels.dtype), sample_mixup['is_crowd'].astype(mosaic_labels.dtype) ], 1) elif 'difficult' in sample_mixup: cp_labels = np.concatenate([ sample_mixup['gt_bbox'], sample_mixup['gt_class'].astype(mosaic_labels.dtype), sample_mixup['difficult'].astype(mosaic_labels.dtype) ], 1) else: cp_labels = np.concatenate([ sample_mixup['gt_bbox'], sample_mixup['gt_class'].astype(mosaic_labels.dtype) ], 1) mosaic_img, mosaic_labels = self.mixup_augment( mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img) sample0 = sample[0] sample0['image'] = mosaic_img.astype(np.uint8) # cannot be float32 sample0['h'] = float(mosaic_img.shape[0]) sample0['w'] = float(mosaic_img.shape[1]) sample0['im_shape'][0] = sample0['h'] sample0['im_shape'][1] = sample0['w'] sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32) sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32) if 'is_crowd' in sample[0]: sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32) if 'difficult' in sample[0]: sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32) return sample0 def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels, img): jit_factor = random.uniform(*self.mixup_scale) FLIP = random.uniform(0, 1) > 0.5 if len(img.shape) == 3: cp_img = np.ones( (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 else: cp_img = np.ones(input_dim, dtype=np.uint8) * 114 cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1]) resized_img = cv2.resize( img, (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
interpolation=cv2.INTER_LINEAR) cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[1] * cp_scale_ratio)] = resized_img cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor))) cp_scale_ratio *= jit_factor if FLIP: cp_img = cp_img[:, ::-1, :] origin_h, origin_w = cp_img.shape[:2] target_h, target_w = origin_img.shape[:2] padded_img = np.zeros( (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8) padded_img[:origin_h, :origin_w] = cp_img x_offset, y_offset = 0, 0 if padded_img.shape[0] > target_h: y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) if padded_img.shape[1] > target_w: x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset: x_offset + target_w] # adjust boxes cp_bboxes_origin_np = cp_labels[:, :4].copy() cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] * cp_scale_ratio, 0, origin_w) cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] * cp_scale_ratio, 0, origin_h) if FLIP: cp_bboxes_origin_np[:, 0::2] = ( origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() if self.remove_outside_box: # for MOT dataset cp_bboxes_transformed_np[:, 0::2] -= x_offset cp_bboxes_transformed_np[:, 1::2] -= y_offset else: cp_bboxes_transformed_np[:, 0::2] = np.clip( cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) cp_bboxes_transformed_np[:, 1::2] = np.clip( cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) cls_labels = cp_labels[:, 4:5].copy() box_labels = cp_bboxes_transformed_np if cp_labels.shape[-1] == 6: crd_labels = cp_labels[:, 5:6].copy() labels = np.hstack((box_labels, cls_labels, crd_labels)) else: labels = np.hstack((box_labels, cls_labels)) if self.remove_outside_box: labels = labels[labels[:, 0] < target_w] labels = labels[labels[:, 2] > 0] labels = labels[labels[:, 1] < target_h] labels = labels[labels[:, 3] > 0] origin_labels = np.vstack((origin_labels, labels)) origin_img = origin_img.astype(np.float32) origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype( np.float32) return origin_img.astype(np.uint8), origin_labels @register_op class PadResize(BaseOperator): """ PadResize for image and gt_bbox Args: target_size (list[int]): input shape fill_value (float): pixel value of padded image """ def __init__(self, target_size, fill_value=114): super(PadResize, self).__init__() if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size self.fill_value = fill_value def _resize(self, img, bboxes, labels): ratio = min(self.target_size[0] / img.shape[0], self.target_size[1] / img.shape[1]) w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio) resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) if len(bboxes) > 0: bboxes *= ratio mask = np.minimum(bboxes[:, 2] - bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1]) > 1 bboxes = bboxes[mask] labels = labels[mask] return resized_img, bboxes, labels def _pad(self, img): h, w, _ = img.shape if h == self.target_size[0] and w == self.target_size[1]: return img padded_img = np.full( (self.target_size[0], self.target_size[1], 3), self.fill_value, dtype=np.uint8) padded_img[:h, :w] = img return padded_img def apply(self, sample, context=None): image = sample['image'] bboxes = sample['gt_bbox'] labels = sample['gt_class'] image, bboxes, labels = self._resize(image, bboxes, labels) sample['image'] =
self._pad(image).astype(np.float32) sample['gt_bbox'] = bboxes sample['gt_class'] = labels return sample @register_op class RandomShift(BaseOperator): """ Randomly shift image Args: prob (float): probability to do random shift. max_shift (int): max shift pixels filter_thr (int): filter gt bboxes if one side is smaller than this """ def __init__(self, prob=0.5, max_shift=32, filter_thr=1): super(RandomShift, self).__init__() self.prob = prob self.max_shift = max_shift self.filter_thr = filter_thr def calc_shift_coor(self, im_h, im_w, shift_h, shift_w): return [ max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w), min(im_h, im_h + shift_h) ] def apply(self, sample, context=None): if random.random() > self.prob: return sample im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] im_h, im_w = im.shape[:2] shift_h = random.randint(-self.max_shift, self.max_shift) shift_w = random.randint(-self.max_shift, self.max_shift) gt_bbox[:, 0::2] += shift_w gt_bbox[:, 1::2] += shift_h gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w) gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h) gt_bbox_h = gt_bbox[:, 2] - gt_bbox[:, 0] gt_bbox_w = gt_bbox[:, 3] - gt_bbox[:, 1] keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr) if not keep.any(): return sample gt_bbox = gt_bbox[keep] gt_class = gt_class[keep] # shift image coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w) # shift frame to the opposite direction coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w) canvas = np.zeros_like(im) canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \ = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]] sample['image'] = canvas sample['gt_bbox'] = gt_bbox sample['gt_class'] = gt_class return sample @register_op class StrongAugImage(BaseOperator): def __init__(self, transforms): super(StrongAugImage, self).__init__() self.transforms = Compose(transforms) def apply(self, sample, context=None): im = sample im['image'] = sample['image'].astype('uint8') results = self.transforms(im) sample['image'] = results['image'].astype('uint8') return sample @register_op class RandomColorJitter(BaseOperator): def __init__(self, prob=0.8, brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1): super(RandomColorJitter, self).__init__() self.prob = prob self.brightness = brightness self.contrast = contrast self.saturation = saturation self.hue = hue def apply(self, sample, context=None): if np.random.uniform(0, 1) < self.prob: from paddle.vision.transforms import ColorJitter transform = ColorJitter(self.brightness, self.contrast, self.saturation, self.hue) sample['image'] = transform(sample['image'].astype(np.uint8)) sample['image'] = sample['image'].astype(np.float32) return sample @register_op class RandomGrayscale(BaseOperator): def __init__(self, prob=0.2): super(RandomGrayscale, self).__init__() self.prob = prob def apply(self, sample, context=None): if np.random.uniform(0, 1) < self.prob: from paddle.vision.transforms import Grayscale transform = Grayscale(num_output_channels=3) sample['image'] = transform(sample['image']) return sample @register_op class RandomGaussianBlur(BaseOperator): def __init__(self, prob=0.5, sigma=[0.1, 2.0]): super(RandomGaussianBlur, self).__init__() self.prob = prob self.sigma = sigma def apply(self, sample, context=None): if np.random.uniform(0, 1) < self.prob: sigma = np.random.uniform(self.sigma[0], self.sigma[1]) im = cv2.GaussianBlur(sample['image'], (23, 23), sigma) sample['image'] = im return sample @register_op 
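# A minimal usage sketch for the strong-augmentation ops above (illustrative
# only, kept in comments so that the @register_op decorator on the preceding
# line still binds to the class below; the sample dict layout follows the
# ppdet convention used throughout this file):
#
#     import numpy as np
#     sample = {'image': np.random.randint(0, 256, (480, 640, 3)).astype(np.uint8)}
#     for op in [RandomColorJitter(prob=1.0), RandomGrayscale(prob=1.0),
#                RandomGaussianBlur(prob=1.0)]:
#         sample = op(sample)  # BaseOperator.__call__ dispatches to apply()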
class RandomErasing(BaseOperator): def __init__(self, prob=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False): super(RandomErasing, self).__init__() assert isinstance(scale, (tuple, list)), "scale should be a tuple or list" assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] ), "scale should be of kind (min, max) and in range [0, 1]" assert isinstance(ratio, (tuple, list)), "ratio should be a tuple or list" assert (ratio[0] >= 0 and ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" assert isinstance( value, (Number, str, tuple, list)), "value should be a number, tuple, list or str" if isinstance(value, str) and value != "random": raise ValueError("value must be 'random' when type is str") self.prob = prob self.scale = scale self.ratio = ratio self.value = value self.inplace = inplace def _erase(self, img, i, j, h, w, v, inplace=False): if not inplace: img = img.copy() img[i:i + h, j:j + w, ...] = v return img def _get_param(self, img, scale, ratio, value): shape = np.asarray(img).astype(np.uint8).shape h, w, c = shape[-3], shape[-2], shape[-1] img_area = h * w log_ratio = np.log(ratio) for _ in range(1): erase_area = np.random.uniform(*scale) * img_area aspect_ratio = np.exp(np.random.uniform(*log_ratio)) erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) if erase_h >= h or erase_w >= w: continue if value is None: v = np.random.normal(size=[erase_h, erase_w, c]) * 255 else: v = np.array(value)[None, None, :] top = np.random.randint(0, h - erase_h + 1) left = np.random.randint(0, w - erase_w + 1) return top, left, erase_h, erase_w, v return 0, 0, h, w, img def apply(self, sample, context=None): if random.random() < self.prob: if isinstance(self.value, Number): value = [self.value] elif isinstance(self.value, str): value = None else: value = self.value if value is not None and not (len(value) == 1 or len(value) == 3): raise ValueError( "Value should be a single number or a sequence with length equals to image's channel." ) im = sample['image'] top, left, erase_h, erase_w, v = self._get_param(im, self.scale, self.ratio, value) im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace) sample['image'] = im return sample @register_op class RandomErasingCrop(BaseOperator): def __init__(self): super(RandomErasingCrop, self).__init__() self.transform1 = RandomErasing( prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random") self.transform2 = RandomErasing( prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random") self.transform3 = RandomErasing( prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random") def apply(self, sample, context=None): sample = self.transform1(sample) sample = self.transform2(sample) sample = self.transform3(sample) return sample ================================================ FILE: ppdet/data/transform/rotated_operators.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import print_function from __future__ import division try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Number, Integral import os import cv2 import numpy as np import math import copy from PIL import Image, ImageDraw from .operators import register_op, BaseOperator, ImageError from ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np from ppdet.utils.logger import setup_logger from ppdet.utils.compact import imagedraw_textsize_c logger = setup_logger(__name__) @register_op class RRotate(BaseOperator): """ Rotate Image, Polygon, Box Args: scale (float): rotate scale angle (float): rotate angle fill_value (int, tuple): fill color auto_bound (bool): whether auto bound or not """ def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True): super(RRotate, self).__init__() self.scale = scale self.angle = angle self.fill_value = fill_value self.auto_bound = auto_bound def get_rotated_matrix(self, angle, scale, h, w): center = ((w - 1) * 0.5, (h - 1) * 0.5) matrix = cv2.getRotationMatrix2D(center, -angle, scale) # calculate the new size cos = np.abs(matrix[0, 0]) sin = np.abs(matrix[0, 1]) new_w = h * sin + w * cos new_h = h * cos + w * sin # calculate offset n_w = int(np.round(new_w)) n_h = int(np.round(new_h)) if self.auto_bound: ratio = min(w / n_w, h / n_h) matrix = cv2.getRotationMatrix2D(center, -angle, ratio) else: matrix[0, 2] += (new_w - w) * 0.5 matrix[1, 2] += (new_h - h) * 0.5 w = n_w h = n_h return matrix, h, w def get_rect_from_pts(self, pts, h, w): """ get minimum rectangle of points """ assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2], axis=1) max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2], axis=1) min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h) max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h) boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1) return boxes def apply_image(self, image, matrix, h, w): return cv2.warpAffine( image, matrix, (w, h), borderValue=self.fill_value) def apply_pts(self, pts, matrix, h, w): assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' # n is number of samples and m is two times the number of points due to (x, y) _, m = pts.shape # transpose points pts_ = pts.reshape(-1, 2).T # pad 1 to convert the points to homogeneous coordinates padding = np.ones((1, pts_.shape[1]), pts.dtype) rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0)) return rotated_pts[:2, :].T.reshape(-1, m) def apply(self, sample, context=None): image = sample['image'] h, w = image.shape[:2] matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w) sample['image'] = self.apply_image(image, matrix, h, w) polys = sample['gt_poly'] # TODO: segment or keypoint to be processed if len(polys) > 0: pts = self.apply_pts(polys, matrix, h, w) sample['gt_poly'] = pts sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w) return sample @register_op class RandomRRotate(BaseOperator): """ Random Rotate Image Args: scale (float, tuple, list): rotate scale scale_mode (str): mode of scale, [range, value, None] angle (float, tuple, list): rotate angle angle_mode (str): mode of angle, [range, value, None] fill_value (float, tuple, list): fill value rotate_prob (float): probability of rotation auto_bound (bool): whether auto
bound or not """ def __init__(self, scale=1.0, scale_mode=None, angle=0., angle_mode=None, fill_value=0., rotate_prob=1.0, auto_bound=True): super(RandomRRotate, self).__init__() self.scale = scale self.scale_mode = scale_mode self.angle = angle self.angle_mode = angle_mode self.fill_value = fill_value self.rotate_prob = rotate_prob self.auto_bound = auto_bound def get_angle(self, angle, angle_mode): assert not angle_mode or angle_mode in [ 'range', 'value' ], 'angle mode should be in [range, value, None]' if not angle_mode: return angle elif angle_mode == 'range': low, high = angle return np.random.rand() * (high - low) + low elif angle_mode == 'value': return np.random.choice(angle) def get_scale(self, scale, scale_mode): assert not scale_mode or scale_mode in [ 'range', 'value' ], 'scale mode should be in [range, value, None]' if not scale_mode: return scale elif scale_mode == 'range': low, high = scale return np.random.rand() * (high - low) + low elif scale_mode == 'value': return np.random.choice(scale) def apply(self, sample, context=None): if np.random.rand() > self.rotate_prob: return sample angle = self.get_angle(self.angle, self.angle_mode) scale = self.get_scale(self.scale, self.scale_mode) rotator = RRotate(scale, angle, self.fill_value, self.auto_bound) return rotator(sample) @register_op class Poly2RBox(BaseOperator): """ Polygon to Rotated Box, using new OpenCV definition since 4.5.1 Args: filter_threshold (int, float): threshold to filter annotations filter_mode (str): filter mode, ['area', 'edge'] rbox_type (str): rbox type, ['le135', 'oc'] """ def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'): super(Poly2RBox, self).__init__() self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode) self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np def filter(self, size, threshold, mode): if mode == 'area': if size[0] * size[1] < threshold: return True elif mode == 'edge': if min(size) < threshold: return True return False def get_rbox(self, polys): valid_ids, rboxes, bboxes = [], [], [] for i, poly in enumerate(polys): cx, cy, w, h, angle = self.rbox_fn(poly) if self.filter_fn((w, h)): continue rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32)) valid_ids.append(i) xmin, ymin = min(poly[0::2]), min(poly[1::2]) xmax, ymax = max(poly[0::2]), max(poly[1::2]) bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32)) if len(valid_ids) == 0: rboxes = np.zeros((0, 5), dtype=np.float32) bboxes = np.zeros((0, 4), dtype=np.float32) else: rboxes = np.stack(rboxes) bboxes = np.stack(bboxes) return rboxes, bboxes, valid_ids def apply(self, sample, context=None): rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly']) sample['gt_rbox'] = rboxes sample['gt_bbox'] = bboxes for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']: if k in sample: sample[k] = sample[k][valid_ids] return sample @register_op class Poly2Array(BaseOperator): """ convert gt_poly to np.array for rotated bboxes """ def __init__(self): super(Poly2Array, self).__init__() def apply(self, sample, context=None): if 'gt_poly' in sample: sample['gt_poly'] = np.array( sample['gt_poly'], dtype=np.float32).reshape((-1, 8)) return sample @register_op class RResize(BaseOperator): def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): """ Resize image to target size. 
if keep_ratio is True, resize the image's long side to the maximum of target_size if keep_ratio is False, resize the image to target size (h, w) Args: target_size (int|list): image target size keep_ratio (bool): whether keep_ratio or not, default true interp (int): the interpolation method """ super(RResize, self).__init__() self.keep_ratio = keep_ratio self.interp = interp if not isinstance(target_size, (Integral, Sequence)): raise TypeError( "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". format(type(target_size))) if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size def apply_image(self, image, scale): im_scale_x, im_scale_y = scale return cv2.resize( image, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) def apply_pts(self, pts, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size pts[:, 0::2] *= im_scale_x pts[:, 1::2] *= im_scale_y pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w) pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h) return pts def apply(self, sample, context=None): """ Resize the image numpy. """ im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: raise ImageError('{}: image is not 3-dimensional.'.format(self)) # apply image im_shape = im.shape if self.keep_ratio: im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) target_size_min = np.min(self.target_size) target_size_max = np.max(self.target_size) im_scale = min(target_size_min / im_size_min, target_size_max / im_size_max) resize_h = im_scale * float(im_shape[0]) resize_w = im_scale * float(im_shape[1]) im_scale_x = im_scale im_scale_y = im_scale else: resize_h, resize_w = self.target_size im_scale_y = resize_h / im_shape[0] im_scale_x = resize_w / im_shape[1] im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) sample['image'] = im.astype(np.float32) sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], [im_scale_x, im_scale_y], [resize_w, resize_h]) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_pts(sample['gt_poly'], [im_scale_x, im_scale_y], [resize_w, resize_h]) return sample @register_op class RandomRFlip(BaseOperator): def __init__(self, prob=0.5): """ Args: prob (float): the probability of flipping image """ super(RandomRFlip, self).__init__() self.prob = prob if not (isinstance(self.prob, float)): raise TypeError("{}: input type is invalid.".format(self)) def apply_image(self, image): return image[:, ::-1, :] def apply_pts(self, pts, width): oldx = pts[:, 0::2].copy() pts[:, 0::2] = width - oldx - 1 return pts def apply(self, sample, context=None): """Flip the image and bounding box. Operators: 1. Flip the image numpy. 2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!) 3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!) Output: sample: the image, bounding box and segmentation part in sample are flipped.
""" if np.random.uniform(0, 1) < self.prob: im = sample['image'] height, width = im.shape[:2] im = self.apply_image(im) if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width) if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width) sample['flipped'] = True sample['image'] = im return sample @register_op class VisibleRBox(BaseOperator): """ In debug mode, visualize images according to `gt_box`. (Currently only supported when not cropping and flipping image.) """ def __init__(self, output_dir='debug'): super(VisibleRBox, self).__init__() self.output_dir = output_dir if not os.path.isdir(output_dir): os.makedirs(output_dir) def apply(self, sample, context=None): image = Image.fromarray(sample['image'].astype(np.uint8)) out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) width = sample['w'] height = sample['h'] # gt_poly = sample['gt_rbox'] gt_poly = sample['gt_poly'] gt_class = sample['gt_class'] draw = ImageDraw.Draw(image) for i in range(gt_poly.shape[0]): x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i] draw.line( [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill='green') # draw label xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) text = str(gt_class[i][0]) tw, th = imagedraw_textsize_c(draw, text) draw.rectangle( [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) if 'gt_keypoint' in sample.keys(): gt_keypoint = sample['gt_keypoint'] if self.is_normalized: for i in range(gt_keypoint.shape[1]): if i % 2: gt_keypoint[:, i] = gt_keypoint[:, i] * height else: gt_keypoint[:, i] = gt_keypoint[:, i] * width for i in range(gt_keypoint.shape[0]): keypoint = gt_keypoint[i] for j in range(int(keypoint.shape[0] / 2)): x1 = round(keypoint[2 * j]).astype(np.int32) y1 = round(keypoint[2 * j + 1]).astype(np.int32) draw.ellipse( (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') save_path = os.path.join(self.output_dir, out_file_name) image.save(save_path, quality=95) return sample @register_op class Rbox2Poly(BaseOperator): """ Convert rbbox format to poly format. """ def __init__(self): super(Rbox2Poly, self).__init__() def apply(self, sample, context=None): assert 'gt_rbox' in sample assert sample['gt_rbox'].shape[1] == 5 rboxes = sample['gt_rbox'] polys = rbox2poly_np(rboxes) sample['gt_poly'] = polys xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1) xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1) sample['gt_bbox'] = np.stack([xmin, ymin, xmin, ymin], axis=1) return sample ================================================ FILE: ppdet/data/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import numbers import numpy as np try: from collections.abc import Sequence, Mapping except: from collections import Sequence, Mapping def default_collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`. It takes input data as a list of samples, where each element in the list is the data of one sample; sample data should be composed of list, dictionary, string, number or numpy array. This function parses input data recursively and stacks number, numpy array and paddle.Tensor data into batch data. e.g. for following input data: [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, {'image': np.array(shape=[3, 224, 224]), 'label': 3}, {'image': np.array(shape=[3, 224, 224]), 'label': 4}, {'image': np.array(shape=[3, 224, 224]), 'label': 5},] This default collate function zips each number and numpy array field together and stacks each field as the batch field as follows: {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} Args: batch(list of sample data): batch should be a list of sample data. Returns: Batched data: each number, numpy array and paddle.Tensor in the input data, batched together. """ sample = batch[0] if isinstance(sample, np.ndarray): batch = np.stack(batch, axis=0) return batch elif isinstance(sample, numbers.Number): batch = np.array(batch) return batch elif isinstance(sample, (str, bytes)): return batch elif isinstance(sample, Mapping): return { key: default_collate_fn([d[key] for d in batch]) for key in sample } elif isinstance(sample, Sequence): sample_fields_num = len(sample) if not all(len(sample) == sample_fields_num for sample in iter(batch)): raise RuntimeError( "fields number not the same among samples in a batch") return [default_collate_fn(fields) for fields in zip(*batch)] raise TypeError("batch data can only contain: tensor, numpy.ndarray, " "dict, list, number, but got {}".format(type(sample))) ================================================ FILE: ppdet/engine/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import trainer from .trainer import * from . import trainer_cot from .trainer_cot import * from . import callbacks from .callbacks import * from . import env from .env import * __all__ = trainer.__all__ \ + callbacks.__all__ \ + env.__all__ from . import tracker from .tracker import * __all__ = __all__ + tracker.__all__ from . import trainer_ssod from .trainer_ssod import * __all__ = __all__ + trainer_ssod.__all__ ================================================ FILE: ppdet/engine/callbacks.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import gc import sys import datetime import six import copy import json import paddle import paddle.distributed as dist from ppdet.utils.checkpoint import save_model, save_semi_model, save_model_info, update_train_results from ppdet.metrics import get_infer_results from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') __all__ = [ 'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer', 'VisualDLWriter', 'SniperProposalsGenerator' ] class Callback(object): def __init__(self, model): self.model = model log_ranks = self.model.cfg.get("log_ranks", '0') if isinstance(log_ranks, str): self.log_ranks = [int(i) for i in log_ranks.split(',')] elif isinstance(log_ranks, int): self.log_ranks = [log_ranks] self.logger = setup_logger('ppdet.engine.callbacks',log_ranks=self.log_ranks) def on_step_begin(self, status): pass def on_step_end(self, status): pass def on_epoch_begin(self, status): pass def on_epoch_end(self, status): pass def on_train_begin(self, status): pass def on_train_end(self, status): pass class ComposeCallback(object): def __init__(self, callbacks): callbacks = [c for c in list(callbacks) if c is not None] for c in callbacks: assert isinstance( c, Callback), "callback should be subclass of Callback" self._callbacks = callbacks def on_step_begin(self, status): for c in self._callbacks: c.on_step_begin(status) def on_step_end(self, status): for c in self._callbacks: c.on_step_end(status) def on_epoch_begin(self, status): for c in self._callbacks: c.on_epoch_begin(status) def on_epoch_end(self, status): for c in self._callbacks: c.on_epoch_end(status) def on_train_begin(self, status): for c in self._callbacks: c.on_train_begin(status) def on_train_end(self, status): for c in self._callbacks: c.on_train_end(status) class LogPrinter(Callback): def __init__(self, model): super(LogPrinter, self).__init__(model) def on_step_end(self, status): if dist.get_world_size() < 2 or dist.get_rank() in self.log_ranks: mode = status['mode'] if mode == 'train': epoch_id = status['epoch_id'] step_id = status['step_id'] steps_per_epoch = status['steps_per_epoch'] training_staus = status['training_staus'] batch_time = status['batch_time'] data_time = status['data_time'] epoches = self.model.cfg.epoch batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( ))]['batch_size'] logs = training_staus.log() space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' if step_id % self.model.cfg.log_iter == 0: eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id eta_sec = eta_steps * batch_time.global_avg eta_str = str(datetime.timedelta(seconds=int(eta_sec))) ips = float(batch_size) / batch_time.avg max_mem_reserved_str = "" max_mem_allocated_str = "" print_mem_info = self.model.cfg.get("print_mem_info", True) if paddle.device.is_compiled_with_cuda() and print_mem_info: max_mem_reserved_str = f", max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB" max_mem_allocated_str = f", max_mem_allocated: 
{paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB" fmt = ' '.join([ 'Epoch: [{}]', '[{' + space_fmt + '}/{}]', 'learning_rate: {lr:.6f}', '{meters}', 'eta: {eta}', 'batch_cost: {btime}', 'data_cost: {dtime}', 'ips: {ips:.4f} images/s' '{max_mem_reserved_str}' '{max_mem_allocated_str}' ]) fmt = fmt.format( epoch_id, step_id, steps_per_epoch, lr=status['learning_rate'], meters=logs, eta=eta_str, btime=str(batch_time), dtime=str(data_time), ips=ips, max_mem_reserved_str=max_mem_reserved_str, max_mem_allocated_str=max_mem_allocated_str) self.logger.info(fmt) if mode == 'eval': step_id = status['step_id'] if step_id % 100 == 0: self.logger.info("Eval iter: {}".format(step_id)) def on_epoch_end(self, status): if dist.get_world_size() < 2 or dist.get_rank() == 0: mode = status['mode'] if mode == 'eval': sample_num = status['sample_num'] cost_time = status['cost_time'] self.logger.info('Total sample number: {}, average FPS: {}'.format( sample_num, sample_num / cost_time)) class Checkpointer(Callback): def __init__(self, model): super(Checkpointer, self).__init__(model) self.best_ap = -1000. self.save_dir = self.model.cfg.save_dir self.uniform_output_enabled = self.model.cfg.get("uniform_output_enabled", False) if hasattr(self.model.model, 'student_model'): self.weight = self.model.model.student_model else: self.weight = self.model.model def on_epoch_end(self, status): # Checkpointer only performed during training mode = status['mode'] epoch_id = status['epoch_id'] weight = None save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: end_epoch = self.model.cfg.epoch save_name = str(epoch_id) if epoch_id != end_epoch - 1 else "model_final" if mode == 'train': end_epoch = self.model.cfg.epoch if ( epoch_id + 1 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: save_name = str( epoch_id) if epoch_id != end_epoch - 1 else "model_final" weight = self.weight.state_dict() elif mode == 'eval': for metric in self.model._metrics: map_res = metric.get_results() eval_func = "ap" if 'pose3d' in map_res: key = 'pose3d' eval_func = "mpjpe" elif 'bbox' in map_res: key = 'bbox' elif 'keypoint' in map_res: key = 'keypoint' else: key = 'mask' key = self.model.cfg.get('target_metrics', key) if key not in map_res: logger.warning("Evaluation results empty, this may be due to " \ "training iterations being too few or not " \ "loading the correct weights.") return epoch_ap = map_res[key][0] epoch_metric = { 'metric': abs(epoch_ap), 'epoch': epoch_id + 1 } save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, f"{save_name}.pdstates") paddle.save(epoch_metric, save_path) if self.uniform_output_enabled: save_model_info(epoch_metric, self.save_dir, save_name) update_train_results(self.model.cfg, save_name, epoch_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema) if 'save_best_model' in status and status['save_best_model']: if epoch_ap >= self.best_ap: self.best_ap = epoch_ap save_name = 'best_model' weight = self.weight.state_dict() best_metric = { 'metric': abs(self.best_ap), 'epoch': epoch_id + 1 } save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, "best_model.pdstates") paddle.save(best_metric, save_path) if self.uniform_output_enabled: save_model_info(best_metric, self.save_dir, save_name) update_train_results(self.model.cfg, save_name, best_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema) 
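# Alongside the weights, a small .pdstates file (metric value plus epoch) is
# saved for both the periodic snapshot and best_model, so later tooling can
# rank checkpoints without loading them.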
logger.info("Best test {} {} is {:0.3f}.".format( key, eval_func, abs(self.best_ap))) if weight: if self.model.use_ema: exchange_save_model = status.get('exchange_save_model', False) if not exchange_save_model: # save model and ema_model save_model( status['weight'], self.model.optimizer, os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, save_name, epoch_id + 1, ema_model=weight) if self.uniform_output_enabled: self.model.export(output_dir=os.path.join(self.save_dir, save_name, "inference"), for_fd=True) gc.collect() else: # save model(student model) and ema_model(teacher model) # in DenseTeacher SSOD, the teacher model will be higher, # so exchange when saving pdparams student_model = status['weight'] # model teacher_model = weight # ema_model save_model( teacher_model, self.model.optimizer, self.save_dir, save_name, epoch_id + 1, ema_model=student_model) del teacher_model del student_model else: save_model(weight, self.model.optimizer, os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, save_name, epoch_id + 1) if self.uniform_output_enabled: self.model.export(output_dir=os.path.join(self.save_dir, save_name, "inference"), for_fd=True) gc.collect() class WiferFaceEval(Callback): def __init__(self, model): super(WiferFaceEval, self).__init__(model) def on_epoch_begin(self, status): assert self.model.mode == 'eval', \ "WiferFaceEval can only be set during evaluation" for metric in self.model._metrics: metric.update(self.model.model) sys.exit() class VisualDLWriter(Callback): """ Use VisualDL to log data or image """ def __init__(self, model): super(VisualDLWriter, self).__init__(model) assert six.PY3, "VisualDL requires Python >= 3.5" try: from visualdl import LogWriter except Exception as e: logger.error('visualdl not found, plaese install visualdl. ' 'for example: `pip install visualdl`.') raise e self.vdl_writer = LogWriter( model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')) self.vdl_loss_step = 0 self.vdl_mAP_step = 0 self.vdl_image_step = 0 self.vdl_image_frame = 0 def on_step_end(self, status): mode = status['mode'] if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'train': training_staus = status['training_staus'] for loss_name, loss_value in training_staus.get().items(): self.vdl_writer.add_scalar(loss_name, loss_value, self.vdl_loss_step) self.vdl_loss_step += 1 elif mode == 'test': ori_image = status['original_image'] result_image = status['result_image'] self.vdl_writer.add_image( "original/frame_{}".format(self.vdl_image_frame), ori_image, self.vdl_image_step) self.vdl_writer.add_image( "result/frame_{}".format(self.vdl_image_frame), result_image, self.vdl_image_step) self.vdl_image_step += 1 # each frame can display ten pictures at most. if self.vdl_image_step % 10 == 0: self.vdl_image_step = 0 self.vdl_image_frame += 1 def on_epoch_end(self, status): mode = status['mode'] if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'eval': for metric in self.model._metrics: for key, map_value in metric.get_results().items(): self.vdl_writer.add_scalar("{}-mAP".format(key), map_value[0], self.vdl_mAP_step) self.vdl_mAP_step += 1 class WandbCallback(Callback): def __init__(self, model): super(WandbCallback, self).__init__(model) try: import wandb self.wandb = wandb except Exception as e: logger.error('wandb not found, please install wandb. 
' 'Use: `pip install wandb`.') raise e self.wandb_params = model.cfg.get('wandb', None) self.save_dir = self.model.cfg.save_dir if self.wandb_params is None: self.wandb_params = {} for k, v in model.cfg.items(): if k.startswith("wandb_"): self.wandb_params.update({k[len("wandb_"):]: v}) self._run = None if dist.get_world_size() < 2 or dist.get_rank() == 0: _ = self.run self.run.config.update(self.model.cfg) self.run.define_metric("epoch") self.run.define_metric("eval/*", step_metric="epoch") self.best_ap = -1000. self.fps = [] @property def run(self): if self._run is None: if self.wandb.run is not None: logger.info( "There is an ongoing wandb run which will be used " "for logging. Please use `wandb.finish()` to end that " "if the behaviour is not intended") self._run = self.wandb.run else: self._run = self.wandb.init(**self.wandb_params) return self._run def save_model(self, optimizer, save_dir, save_name, last_epoch, ema_model=None, ap=None, fps=None, tags=None): if dist.get_world_size() < 2 or dist.get_rank() == 0: model_path = os.path.join(save_dir, save_name) metadata = {} metadata["last_epoch"] = last_epoch if ap: metadata["ap"] = ap if fps: metadata["fps"] = fps if ema_model: ema_artifact = self.wandb.Artifact( name="ema_model-{}".format(self.run.id), type="model", metadata=metadata) model_artifact = self.wandb.Artifact( name="model-{}".format(self.run.id), type="model", metadata=metadata) ema_artifact.add_file(model_path + ".pdema", name="model_ema") model_artifact.add_file(model_path + ".pdparams", name="model") self.run.log_artifact(ema_artifact, aliases=tags) self.run.log_artifact(model_artifact, aliases=tags) else: model_artifact = self.wandb.Artifact( name="model-{}".format(self.run.id), type="model", metadata=metadata) model_artifact.add_file(model_path + ".pdparams", name="model") self.run.log_artifact(model_artifact, aliases=tags) def on_step_end(self, status): mode = status['mode'] if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'train': training_status = status['training_staus'].get() for k, v in training_status.items(): training_status[k] = float(v) # calculate ips, data_cost, batch_cost batch_time = status['batch_time'] data_time = status['data_time'] batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( ))]['batch_size'] ips = float(batch_size) / float(batch_time.avg) data_cost = float(data_time.avg) batch_cost = float(batch_time.avg) metrics = {"train/" + k: v for k, v in training_status.items()} metrics["train/ips"] = ips metrics["train/data_cost"] = data_cost metrics["train/batch_cost"] = batch_cost self.fps.append(ips) self.run.log(metrics) def on_epoch_end(self, status): mode = status['mode'] epoch_id = status['epoch_id'] save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'train': fps = sum(self.fps) / len(self.fps) self.fps = [] end_epoch = self.model.cfg.epoch if ( epoch_id + 1 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: save_name = str( epoch_id) if epoch_id != end_epoch - 1 else "model_final" tags = ["latest", "epoch_{}".format(epoch_id)] self.save_model( self.model.optimizer, self.save_dir, save_name, epoch_id + 1, self.model.use_ema, fps=fps, tags=tags) if mode == 'eval': sample_num = status['sample_num'] cost_time = status['cost_time'] fps = sample_num / cost_time merged_dict = {} for metric in self.model._metrics: for key, map_value in metric.get_results().items(): merged_dict["eval/{}-mAP".format(key)] = map_value[0] merged_dict["epoch"] = status["epoch_id"]
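# merged_dict bundles every metric's mAP together with the epoch index and the
# eval fps, so a single consolidated wandb record is logged per eval pass.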
merged_dict["eval/fps"] = sample_num / cost_time self.run.log(merged_dict) if 'save_best_model' in status and status['save_best_model']: for metric in self.model._metrics: map_res = metric.get_results() if 'pose3d' in map_res: key = 'pose3d' elif 'bbox' in map_res: key = 'bbox' elif 'keypoint' in map_res: key = 'keypoint' else: key = 'mask' if key not in map_res: logger.warning("Evaluation results empty, this may be due to " \ "training iterations being too few or not " \ "loading the correct weights.") return if map_res[key][0] >= self.best_ap: self.best_ap = map_res[key][0] save_name = 'best_model' tags = ["best", "epoch_{}".format(epoch_id)] self.save_model( self.model.optimizer, self.save_dir, save_name, last_epoch=epoch_id + 1, ema_model=self.model.use_ema, ap=abs(self.best_ap), fps=fps, tags=tags) def on_train_end(self, status): self.run.finish() class SniperProposalsGenerator(Callback): def __init__(self, model): super(SniperProposalsGenerator, self).__init__(model) ori_dataset = self.model.dataset self.dataset = self._create_new_dataset(ori_dataset) self.loader = self.model.loader self.cfg = self.model.cfg self.infer_model = self.model.model def _create_new_dataset(self, ori_dataset): dataset = copy.deepcopy(ori_dataset) # init anno_cropper dataset.init_anno_cropper() # generate infer roidbs ori_roidbs = dataset.get_ori_roidbs() roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs) # set new roidbs dataset.set_roidbs(roidbs) return dataset def _eval_with_loader(self, loader): results = [] with paddle.no_grad(): self.infer_model.eval() for step_id, data in enumerate(loader): outs = self.infer_model(data) for key in ['im_shape', 'scale_factor', 'im_id']: outs[key] = data[key] for key, value in outs.items(): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) return results def on_train_end(self, status): self.loader.dataset = self.dataset results = self._eval_with_loader(self.loader) results = self.dataset.anno_cropper.aggregate_chips_detections(results) # sniper proposals = [] clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} for outs in results: batch_res = get_infer_results(outs, clsid2catid) start = 0 for i, im_id in enumerate(outs['im_id']): bbox_num = outs['bbox_num'] end = start + bbox_num[i] bbox_res = batch_res['bbox'][start:end] \ if 'bbox' in batch_res else None if bbox_res: proposals += bbox_res logger.info("save proposals in {}".format(self.cfg.proposals_path)) with open(self.cfg.proposals_path, 'w') as f: json.dump(proposals, f) class SemiLogPrinter(LogPrinter): def __init__(self, model): super(SemiLogPrinter, self).__init__(model) def on_step_end(self, status): if dist.get_world_size() < 2 or dist.get_rank() == 0: mode = status['mode'] if mode == 'train': epoch_id = status['epoch_id'] step_id = status['step_id'] iter_id = status['iter_id'] steps_per_epoch = status['steps_per_epoch'] training_staus = status['training_staus'] batch_time = status['batch_time'] data_time = status['data_time'] epoches = self.model.cfg.epoch batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( ))]['batch_size'] iters = epoches * steps_per_epoch logs = training_staus.log() iter_space_fmt = ':' + str(len(str(iters))) + 'd' space_fmt = ':' + str(len(str(iters))) + 'd' if step_id % self.model.cfg.log_iter == 0: eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id eta_sec = eta_steps * batch_time.global_avg eta_str = str(datetime.timedelta(seconds=int(eta_sec))) ips = float(batch_size) / batch_time.avg fmt = ' '.join([ '{' + 
iter_space_fmt + '}/{} iters', 'Epoch: [{}]', '[{' + space_fmt + '}/{}]', 'learning_rate: {lr:.6f}', '{meters}', 'eta: {eta}', 'batch_cost: {btime}', 'data_cost: {dtime}', 'ips: {ips:.4f} images/s', ]) fmt = fmt.format( iter_id, iters, epoch_id, step_id, steps_per_epoch, lr=status['learning_rate'], meters=logs, eta=eta_str, btime=str(batch_time), dtime=str(data_time), ips=ips) logger.info(fmt) if mode == 'eval': step_id = status['step_id'] if step_id % 100 == 0: logger.info("Eval iter: {}".format(step_id)) class SemiCheckpointer(Checkpointer): def __init__(self, model): super(SemiCheckpointer, self).__init__(model) cfg = self.model.cfg self.best_ap = 0. self.save_dir = os.path.join(self.model.cfg.save_dir, self.model.cfg.filename) if hasattr(self.model.model, 'student') and hasattr(self.model.model, 'teacher'): self.weight = (self.model.model.teacher, self.model.model.student) elif hasattr(self.model.model, 'student') or hasattr(self.model.model, 'teacher'): raise AttributeError( "model has only one of the attributes 'student' and 'teacher'; both are required") else: raise AttributeError( "model has neither attribute 'student' nor 'teacher'") def every_n_iters(self, iter_id, n): return (iter_id + 1) % n == 0 if n > 0 else False def on_step_end(self, status): # Checkpointer only performed during training mode = status['mode'] eval_interval = status['eval_interval'] save_interval = status['save_interval'] iter_id = status['iter_id'] epoch_id = status['epoch_id'] t_weight = None s_weight = None save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: if self.every_n_iters(iter_id, save_interval) and mode == 'train': save_name = "last_epoch" # save_name = str(iter_id + 1) t_weight = self.weight[0].state_dict() s_weight = self.weight[1].state_dict() save_semi_model(t_weight, s_weight, self.model.optimizer, self.save_dir, save_name, epoch_id + 1, iter_id + 1) def on_epoch_end(self, status): # Checkpointer only performed during training mode = status['mode'] eval_interval = status['eval_interval'] save_interval = status['save_interval'] iter_id = status['iter_id'] epoch_id = status['epoch_id'] t_weight = None s_weight = None save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: if self.every_n_iters(iter_id, eval_interval) and mode == 'eval': if 'save_best_model' in status and status['save_best_model']: for metric in self.model._metrics: map_res = metric.get_results() if 'bbox' in map_res: key = 'bbox' elif 'keypoint' in map_res: key = 'keypoint' else: key = 'mask' if key not in map_res: logger.warning("Evaluation results empty, this may be due to " \ "training iterations being too few or not " \ "loading the correct weights.") return if map_res[key][0] > self.best_ap: self.best_ap = map_res[key][0] save_name = 'best_model' t_weight = self.weight[0].state_dict() s_weight = self.weight[1].state_dict() logger.info("Best teacher test {} ap is {:0.3f}.". format(key, self.best_ap)) if t_weight and s_weight: save_semi_model(t_weight, s_weight, self.model.optimizer, self.save_dir, save_name, epoch_id + 1, iter_id + 1) ================================================ FILE: ppdet/engine/env.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import random import numpy as np import paddle from paddle.distributed import fleet __all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env'] def init_fleet_env(find_unused_parameters=False): strategy = fleet.DistributedStrategy() strategy.find_unused_parameters = find_unused_parameters fleet.init(is_collective=True, strategy=strategy) def init_parallel_env(): env = os.environ dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if dist: trainer_id = int(env['PADDLE_TRAINER_ID']) local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) paddle.distributed.init_parallel_env() def set_random_seed(seed): paddle.seed(seed) random.seed(seed) np.random.seed(seed) ================================================ FILE: ppdet/engine/export_utils.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
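# A hypothetical sketch (not part of this module) of how the TO_STATIC_SPEC
# entries defined below are consumed: paddle.jit.to_static wraps a model with a
# fixed input signature, so export needs no concrete example tensor; `model`
# stands in for any ppdet architecture.
#
#     import paddle
#     spec = [{'image': paddle.static.InputSpec(
#         name='image', shape=[-1, 3, -1, -1], dtype='float32')}]
#     # static_model = paddle.jit.to_static(model, input_spec=spec)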
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import yaml from collections import OrderedDict import paddle from ppdet.data.source.category import get_categories from ppdet.core.workspace import load_config from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') # Global dictionary TRT_MIN_SUBGRAPH = { 'YOLO': 3, 'PPYOLOE': 3, 'SSD': 60, 'RCNN': 40, 'RetinaNet': 40, 'S2ANet': 80, 'EfficientDet': 40, 'Face': 3, 'TTFNet': 60, 'FCOS': 16, 'SOLOv2': 60, 'HigherHRNet': 3, 'HRNet': 3, 'DeepSORT': 3, 'ByteTrack': 10, 'CenterTrack': 5, 'JDE': 10, 'FairMOT': 5, 'GFL': 16, 'PicoDet': 3, 'CenterNet': 5, 'TOOD': 5, 'YOLOX': 8, 'YOLOF': 40, 'METRO_Body': 3, 'DETR': 3, 'CLRNet': 3 } KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet'] MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] LANE_ARCH = ['CLRNet'] TO_STATIC_SPEC = { 'yolov3_darknet53_270e_coco': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'is_crowd': paddle.static.InputSpec( name='is_crowd', shape=[-1, 50], dtype='float32'), 'gt_bbox': paddle.static.InputSpec( name='gt_bbox', shape=[-1, 50, 4], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'target0': paddle.static.InputSpec( name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'), 'target1': paddle.static.InputSpec( name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'), 'target2': paddle.static.InputSpec( name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'), }], 'tinypose_128x96': [{ 'center': paddle.static.InputSpec( name='center', shape=[-1, 2], dtype='float32'), 'scale': paddle.static.InputSpec( name='scale', shape=[-1, 2], dtype='float32'), 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, 128, 96], dtype='float32'), 'score': paddle.static.InputSpec( name='score', shape=[-1], dtype='float32'), 'rotate': paddle.static.InputSpec( name='rotate', shape=[-1], dtype='float32'), 'target': paddle.static.InputSpec( name='target', shape=[-1, 17, 32, 24], dtype='float32'), 'target_weight': paddle.static.InputSpec( name='target_weight', shape=[-1, 17, 1], dtype='float32'), }], 'fcos_r50_fpn_1x_coco': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'reg_target0': paddle.static.InputSpec( name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'), 'labels0': paddle.static.InputSpec( name='labels0', shape=[-1, 160, 160, 1], dtype='int32'), 'centerness0': paddle.static.InputSpec( name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'), 'reg_target1': 
paddle.static.InputSpec( name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'), 'labels1': paddle.static.InputSpec( name='labels1', shape=[-1, 80, 80, 1], dtype='int32'), 'centerness1': paddle.static.InputSpec( name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'), 'reg_target2': paddle.static.InputSpec( name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'), 'labels2': paddle.static.InputSpec( name='labels2', shape=[-1, 40, 40, 1], dtype='int32'), 'centerness2': paddle.static.InputSpec( name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'), 'reg_target3': paddle.static.InputSpec( name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'), 'labels3': paddle.static.InputSpec( name='labels3', shape=[-1, 20, 20, 1], dtype='int32'), 'centerness3': paddle.static.InputSpec( name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'), 'reg_target4': paddle.static.InputSpec( name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'), 'labels4': paddle.static.InputSpec( name='labels4', shape=[-1, 10, 10, 1], dtype='int32'), 'centerness4': paddle.static.InputSpec( name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'), }], 'picodet_s_320_coco_lcnet': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'is_crowd': paddle.static.InputSpec( name='is_crowd', shape=[-1, -1, 1], dtype='float32'), 'gt_class': paddle.static.InputSpec( name='gt_class', shape=[-1, -1, 1], dtype='int32'), 'gt_bbox': paddle.static.InputSpec( name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'pad_gt_mask': paddle.static.InputSpec( name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), }], 'ppyoloe_crn_s_300e_coco': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'is_crowd': paddle.static.InputSpec( name='is_crowd', shape=[-1, -1, 1], dtype='float32'), 'gt_class': paddle.static.InputSpec( name='gt_class', shape=[-1, -1, 1], dtype='int32'), 'gt_bbox': paddle.static.InputSpec( name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'pad_gt_mask': paddle.static.InputSpec( name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), }], } def apply_to_static(config, model): filename = config.get('filename', None) spec = TO_STATIC_SPEC.get(filename, None) model = paddle.jit.to_static(model, input_spec=spec) logger.info("Successfully to apply @to_static with specs: {}".format(spec)) return model def _prune_input_spec(input_spec, program, targets): # try to prune static program to figure out pruned input spec # so we perform following operations in static mode device = paddle.get_device() paddle.enable_static() paddle.set_device(device) pruned_input_spec = [{}] program = 
program.clone()
    program = program._prune(targets=targets)
    global_block = program.global_block()
    pir_value_set = set()
    if paddle.framework.use_pir_api():
        for op in global_block.ops:
            if op.name() == 'pd_op.data':
                pir_value_set.add(op.attrs()["name"])
    for name, spec in input_spec[0].items():
        if paddle.framework.use_pir_api():
            if name in pir_value_set:
                pruned_input_spec[0][name] = spec
        else:
            try:
                v = global_block.var(name)
                pruned_input_spec[0][name] = spec
            except Exception:
                pass
    paddle.disable_static(place=device)
    return pruned_input_spec


def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
    preprocess_list = []
    label_list = []
    if arch != "lane_arch":
        anno_file = dataset_cfg.get_anno()
        clsid2catid, catid2name = get_categories(metric, anno_file, arch)
        label_list = [str(cat) for cat in catid2name.values()]

    fuse_normalize = reader_cfg.get('fuse_normalize', False)
    sample_transforms = reader_cfg['sample_transforms']
    hpi_dynamic_shape = None
    for st in sample_transforms[1:]:
        for key, value in st.items():
            p = {'type': key}
            if key == 'Resize':
                if int(image_shape[1]) != -1:
                    value['target_size'] = image_shape[1:]
                    hpi_dynamic_shape = image_shape[1:]
                value['interp'] = value.get('interp', 1)  # cv2.INTER_LINEAR
            if fuse_normalize and key == 'NormalizeImage':
                continue
            p.update(value)
            preprocess_list.append(p)
    batch_transforms = reader_cfg.get('batch_transforms', None)
    if batch_transforms:
        for bt in batch_transforms:
            for key, value in bt.items():
                # for deploy/infer, use PadStride(stride) instead of PadBatch(pad_to_stride)
                if key == 'PadBatch':
                    preprocess_list.append({
                        'type': 'PadStride',
                        'stride': value['pad_to_stride']
                    })
                    break
                elif key == "CULaneResize":
                    # cut and resize
                    p = {'type': key}
                    p.update(value)
                    p.update({"cut_height": dataset_cfg.cut_height})
                    preprocess_list.append(p)
                    break

    return preprocess_list, label_list, hpi_dynamic_shape


def _parse_tracker(tracker_cfg):
    tracker_params = {}
    for k, v in tracker_cfg.items():
        tracker_params.update({k: v})
    return tracker_params


def _dump_infer_config(config, path, image_shape, model):
    arch_state = False
    from ppdet.core.config.yaml_helpers import setup_orderdict
    setup_orderdict()
    use_dynamic_shape = True if image_shape[2] == -1 else False
    infer_cfg = OrderedDict({
        'mode': 'paddle',
        'draw_threshold': 0.5,
        'metric': config['metric'],
        'use_dynamic_shape': use_dynamic_shape
    })
    if config.get('pdx_model_name', None):
        infer_cfg["Global"] = {"model_name": config["pdx_model_name"]}
    export_onnx = config.get('export_onnx', False)
    export_eb = config.get('export_eb', False)

    infer_arch = config['architecture']
    if 'RCNN' in infer_arch and export_onnx:
        logger.warning(
            "Exporting RCNN model to ONNX only support batch_size = 1")
        infer_cfg['export_onnx'] = True
        infer_cfg['export_eb'] = export_eb

    if infer_arch in MOT_ARCH:
        if infer_arch == 'DeepSORT':
            tracker_cfg = config['DeepSORTTracker']
        elif infer_arch == 'CenterTrack':
            tracker_cfg = config['CenterTracker']
        else:
            tracker_cfg = config['JDETracker']
        infer_cfg['tracker'] = _parse_tracker(tracker_cfg)

    for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():
        if arch in infer_arch:
            infer_cfg['arch'] = arch
            infer_cfg['min_subgraph_size'] = min_subgraph_size
            arch_state = True
            break

    if infer_arch == 'PPYOLOEWithAuxHead':
        infer_arch = 'PPYOLOE'  # exported as plain PPYOLOE for inference

    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
        infer_cfg['arch'] = infer_arch
        infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
        arch_state = True

    if infer_arch == 'DETR' and config.get('with_mask', False):
        infer_cfg['mask'] = True

    if not arch_state:
        logger.error(
'Architecture: {} is not supported for exporting model now.\n'. format(infer_arch) + 'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py') os._exit(0) if 'mask_head' in config[config['architecture']] and config[config[ 'architecture']]['mask_head']: infer_cfg['mask'] = True if 'with_mask' in config[config['architecture']] and config[config[ 'architecture']]['with_mask']: infer_cfg['mask'] = True label_arch = 'detection_arch' if infer_arch in KEYPOINT_ARCH: label_arch = 'keypoint_arch' if infer_arch in LANE_ARCH: infer_cfg['arch'] = infer_arch infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] infer_cfg['img_w'] = config['img_w'] infer_cfg['ori_img_h'] = config['ori_img_h'] infer_cfg['cut_height'] = config['cut_height'] label_arch = 'lane_arch' head_name = "CLRHead" infer_cfg['conf_threshold'] = config[head_name]['conf_threshold'] infer_cfg['nms_thres'] = config[head_name]['nms_thres'] infer_cfg['max_lanes'] = config[head_name]['max_lanes'] infer_cfg['num_points'] = config[head_name]['num_points'] arch_state = True if infer_arch in MOT_ARCH: if config['metric'] in ['COCO', 'VOC']: # MOT model run as Detector reader_cfg = config['TestReader'] dataset_cfg = config['TestDataset'] else: # 'metric' in ['MOT', 'MCMOT', 'KITTI'] label_arch = 'mot_arch' reader_cfg = config['TestMOTReader'] dataset_cfg = config['TestMOTDataset'] else: reader_cfg = config['TestReader'] dataset_cfg = config['TestDataset'] infer_cfg['Preprocess'], infer_cfg['label_list'], hpi_dynamic_shape = _parse_reader( reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) if config.get("uniform_output_enabled", None): def get_dynamic_shapes(hpi_shape): return [[1, 3] + hpi_shape, [1, 3] + hpi_shape, [8, 3] + hpi_shape] dynamic_shapes = get_dynamic_shapes(hpi_dynamic_shape) if hpi_dynamic_shape else [ [1, 3, 320, 320], [1, 3, 640, 640], [8, 3, 1280, 1280] ] shapes = { "image": dynamic_shapes, "im_shape": [[1, 2], [1, 2], [8, 2]], "scale_factor": [[1, 2], [1, 2], [8, 2]] } trt_dynamic_shape = [ [dim for _ in range(shape[0]) for dim in shape[2:]] for shape in dynamic_shapes ] trt_dynamic_shape_input_data = { "im_shape": trt_dynamic_shape, "scale_factor": [ [2, 2], [1, 1], [0.67 for _ in range(2 * shapes["scale_factor"][-1][0])] ] } hpi_config = OrderedDict({ "backend_configs": OrderedDict({ "paddle_infer": OrderedDict({ "trt_dynamic_shapes": shapes, "trt_dynamic_shape_input_data": trt_dynamic_shape_input_data }), "tensorrt": OrderedDict({ "dynamic_shapes": shapes }) }) }) infer_cfg["Hpi"] = hpi_config if infer_arch == 'PicoDet': if hasattr(config, 'export') and config['export'].get( 'post_process', False) and not config['export'].get('benchmark', False): infer_cfg['arch'] = 'GFL' head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead' infer_cfg['NMS'] = config[head_name]['nms'] # In order to speed up the prediction, the threshold of nms # is adjusted here, which can be changed in infer_cfg.yml config[head_name]['nms']["score_threshold"] = 0.3 config[head_name]['nms']["nms_threshold"] = 0.5 infer_cfg['fpn_stride'] = config[head_name]['fpn_stride'] yaml.dump(infer_cfg, open(path, 'w')) logger.info("Export inference config file to {}".format(os.path.join(path))) ================================================ FILE: ppdet/engine/naive_sync_bn.py ================================================ import paddle.distributed as dist import math import paddle import paddle.nn as nn class _AllReduce(paddle.autograd.PyLayer): @staticmethod def forward(ctx, input): input_list = 
[paddle.zeros_like(input) for k in range(dist.get_world_size())]
        # Use allgather instead of allreduce since I don't trust in-place
        # operations.
        dist.all_gather(input_list, input, sync_op=True)
        inputs = paddle.stack(input_list, axis=0)
        return paddle.sum(inputs, axis=0)

    @staticmethod
    def backward(ctx, grad_output):
        dist.all_reduce(grad_output, sync_op=True)
        return grad_output


def differentiable_all_reduce(input):
    """
    Differentiable counterpart of `dist.all_reduce`.
    """
    if (not dist.is_available() or not dist.is_initialized() or
            dist.get_world_size() == 1):
        return input
    return _AllReduce.apply(input)


class NaiveSyncBatchNorm(nn.BatchNorm2D):
    def __init__(self, *args, stats_mode="", **kwargs):
        super().__init__(*args, **kwargs)
        assert stats_mode in ["", "N"]
        self._stats_mode = stats_mode

    def forward(self, input):
        if dist.get_world_size() == 1 or not self.training:
            return super(NaiveSyncBatchNorm, self).forward(input)

        B, C = input.shape[0], input.shape[1]

        mean = paddle.mean(input, axis=[0, 2, 3])
        meansqr = paddle.mean(input * input, axis=[0, 2, 3])

        if self._stats_mode == "":
            assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
            vec = paddle.concat([mean, meansqr], axis=0)
            vec = differentiable_all_reduce(vec) * (
                1.0 / dist.get_world_size())
            mean, meansqr = paddle.split(vec, [C, C])
            momentum = 1 - self._momentum  # NOTE: paddle has a reversed momentum definition
        else:
            if B == 0:
                vec = paddle.zeros([2 * C + 1], dtype=mean.dtype)
                vec = vec + input.sum()  # make sure there is gradient w.r.t input
            else:
                vec = paddle.concat(
                    [
                        mean,
                        meansqr,
                        paddle.ones([1], dtype=mean.dtype),
                    ],
                    axis=0, )
            vec = differentiable_all_reduce(vec * B)

            total_batch = vec[-1].detach()
            momentum = total_batch.clip(max=1) * (
                1 - self._momentum)  # no update if total_batch is 0
            mean, meansqr, _ = paddle.split(
                vec / total_batch.clip(min=1),
                [C, C, int(vec.shape[0] - 2 * C)])  # avoid div-by-zero

        var = meansqr - mean * mean
        invstd = paddle.rsqrt(var + self._epsilon)
        scale = self.weight * invstd
        bias = self.bias - mean * scale
        scale = scale.reshape([1, -1, 1, 1])
        bias = bias.reshape([1, -1, 1, 1])

        tmp_mean = self._mean + momentum * (mean.detach() - self._mean)
        self._mean.set_value(tmp_mean)
        tmp_variance = self._variance + (momentum *
                                         (var.detach() - self._variance))
        self._variance.set_value(tmp_variance)
        ret = input * scale + bias
        return ret


def convert_syncbn(model):
    for n, m in model.named_children():
        if isinstance(m, nn.layer.norm._BatchNormBase):
            syncbn = NaiveSyncBatchNorm(m._num_features, m._momentum,
                                        m._epsilon, m._weight_attr,
                                        m._bias_attr)
            setattr(model, n, syncbn)
        else:
            convert_syncbn(m)


================================================
FILE: ppdet/engine/tracker.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
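# A self-contained numpy check (illustrative, not ppdet code) of the moment
# trick NaiveSyncBatchNorm uses above: each rank all-reduces per-channel
# E[x] and E[x^2]; averaging those across equally sized per-rank batches and
# applying var = E[x^2] - E[x]^2 recovers the exact global batch statistics
# without ever gathering the activations themselves.
import numpy as np

rank_batches = [np.random.randn(4, 3, 8, 8) for _ in range(2)]  # two "ranks"

# What each rank contributes to the all-reduce: concat([mean, meansqr])
# computed over the N, H, W axes.
vecs = [
    np.concatenate([x.mean(axis=(0, 2, 3)), (x * x).mean(axis=(0, 2, 3))])
    for x in rank_batches
]
mean, meansqr = np.split(np.mean(vecs, axis=0), 2)
var = meansqr - mean * mean

# Reference: statistics of the full (virtual) global batch.
full = np.concatenate(rank_batches, axis=0)
assert np.allclose(mean, full.mean(axis=(0, 2, 3)))
assert np.allclose(var, full.var(axis=(0, 2, 3)))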
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import glob import re import paddle import paddle.nn as nn import numpy as np from tqdm import tqdm from collections import defaultdict from ppdet.core.workspace import create from ppdet.utils.checkpoint import load_weight, load_pretrain_weight from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box from ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results from ppdet.modeling.mot.tracker import JDETracker, CenterTracker from ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker from ppdet.modeling.architectures import YOLOX from ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric from ppdet.data.source.category import get_categories import ppdet.utils.stats as stats from .callbacks import Callback, ComposeCallback from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] MOT_ARCH_JDE = MOT_ARCH[:2] MOT_ARCH_SDE = MOT_ARCH[2:4] MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti'] __all__ = ['Tracker'] class Tracker(object): def __init__(self, cfg, mode='eval'): self.cfg = cfg assert mode.lower() in ['test', 'eval'], \ "mode should be 'test' or 'eval'" self.mode = mode.lower() self.optimizer = None # build MOT data loader self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())] # build model self.model = create(cfg.architecture) if isinstance(self.model.detector, YOLOX): for k, m in self.model.named_sublayers(): if isinstance(m, nn.BatchNorm2D): m._epsilon = 1e-3 # for amp(fp16) m._momentum = 0.97 # 0.03 in pytorch anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) self.ids2names = [] for k, v in catid2name.items(): self.ids2names.append(v) self.status = {} self.start_epoch = 0 # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def _init_callbacks(self): self._callbacks = [] self._compose_callback = None def _init_metrics(self): if self.mode in ['test']: self._metrics = [] return if self.cfg.metric == 'MOT': self._metrics = [MOTMetric(), ] elif self.cfg.metric == 'MCMOT': self._metrics = [MCMOTMetric(self.cfg.num_classes), ] elif self.cfg.metric == 'KITTI': self._metrics = [KITTIMOTMetric(), ] else: logger.warning("Metric not support for metric type {}".format( self.cfg.metric)) self._metrics = [] def _reset_metrics(self): for metric in self._metrics: metric.reset() def register_callbacks(self, callbacks): callbacks = [h for h in list(callbacks) if h is not None] for c in callbacks: assert isinstance(c, Callback), \ "metrics shoule be instances of subclass of Metric" self._callbacks.extend(callbacks) self._compose_callback = ComposeCallback(self._callbacks) def register_metrics(self, metrics): metrics = [m for m in list(metrics) if m is not None] for m in metrics: assert isinstance(m, Metric), \ "metrics shoule be instances of subclass of Metric" self._metrics.extend(metrics) def load_weights_jde(self, weights): load_weight(self.model, weights, self.optimizer) def load_weights_sde(self, det_weights, reid_weights): with_detector = self.model.detector is not None with_reid = self.model.reid is not None if with_detector: load_weight(self.model.detector, det_weights) if with_reid: load_weight(self.model.reid, reid_weights) else: 
load_weight(self.model.reid, reid_weights) def _eval_seq_centertrack(self, dataloader, save_dir=None, show_image=False, frame_rate=30, draw_threshold=0): assert isinstance(self.model.tracker, CenterTracker) if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) tracker = self.model.tracker timer = MOTTimer() frame_id = 0 self.status['mode'] = 'track' self.model.eval() results = defaultdict(list) # only support single class now for step_id, data in enumerate(tqdm(dataloader)): self.status['step_id'] = step_id if step_id == 0: self.model.reset_tracking() # forward timer.tic() pred_ret = self.model(data) online_targets = tracker.update(pred_ret) online_tlwhs, online_scores, online_ids = [], [], [] for t in online_targets: bbox = t['bbox'] tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]] tscore = float(t['score']) tid = int(t['tracking_id']) if tlwh[2] * tlwh[3] > 0: online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(tscore) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) frame_id += 1 return results, frame_id, timer.average_time, timer.calls def _eval_seq_jde(self, dataloader, save_dir=None, show_image=False, frame_rate=30, draw_threshold=0): if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) tracker = self.model.tracker tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer) timer = MOTTimer() frame_id = 0 self.status['mode'] = 'track' self.model.eval() results = defaultdict(list) # support single class and multi classes for step_id, data in enumerate(tqdm(dataloader)): self.status['step_id'] = step_id # forward timer.tic() pred_dets, pred_embs = self.model(data) pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy() online_targets_dict = self.model.tracker.update(pred_dets, pred_embs) online_tlwhs = defaultdict(list) online_scores = defaultdict(list) online_ids = defaultdict(list) for cls_id in range(self.cfg.num_classes): online_targets = online_targets_dict[cls_id] for t in online_targets: tlwh = t.tlwh tid = t.track_id tscore = t.score if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ 3] > tracker.vertical_ratio: continue online_tlwhs[cls_id].append(tlwh) online_ids[cls_id].append(tid) online_scores[cls_id].append(tscore) # save results results[cls_id].append( (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], online_ids[cls_id])) timer.toc() save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) frame_id += 1 return results, frame_id, timer.average_time, timer.calls def _eval_seq_sde(self, dataloader, save_dir=None, show_image=False, frame_rate=30, seq_name='', scaled=False, det_file='', draw_threshold=0): if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) use_detector = False if not self.model.detector else True use_reid = hasattr(self.model, 'reid') if use_reid and self.model.reid is not None: use_reid = True else: use_reid = False timer = MOTTimer() results = defaultdict(list) frame_id = 0 self.status['mode'] = 'track' self.model.eval() if use_reid: self.model.reid.eval() if not use_detector: dets_list = load_det_results(det_file, len(dataloader)) logger.info('Finish loading detection results file {}.'.format( det_file)) 
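            # For reference, based on how load_det_results parses det_file:
            # the file is plain comma-separated text, one detection per line,
            # seven fields per row:
            #
            #     frame_id, x0, y0, w, h, score, class_id
            #     e.g. 1,503.5,158.9,55.1,166.3,0.97,0
            #
            # dets_list[frame_id]['bbox'] therefore holds tlwh boxes, which is
            # why they are converted to xyxy corners in the loop below.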
        tracker = self.model.tracker
        for step_id, data in enumerate(tqdm(dataloader)):
            self.status['step_id'] = step_id
            ori_image = data['ori_image']  # [bs, H, W, 3]
            ori_image_shape = data['ori_image'].shape[1:3]
            # ori_image_shape: [H, W]
            input_shape = data['image'].shape[2:]
            # input_shape: [h, w], before data transforms, set in model config
            im_shape = data['im_shape'][0].numpy()
            # im_shape: [new_h, new_w], after data transforms
            scale_factor = data['scale_factor'][0].numpy()
            empty_detections = False
            # when there are no detected bboxes, the ReID model will not be
            # run, and the original image is used for visualization instead

            # forward
            timer.tic()
            if not use_detector:
                dets = dets_list[frame_id]
                bbox_tlwh = np.array(dets['bbox'], dtype='float32')
                if bbox_tlwh.shape[0] > 0:
                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes
                    pred_cls_ids = np.array(dets['cls_id'], dtype='float32')
                    pred_scores = np.array(dets['score'], dtype='float32')
                    pred_bboxes = np.concatenate(
                        (bbox_tlwh[:, 0:2],
                         bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]),
                        axis=1)
                else:
                    logger.warning(
                        'Frame {} has no detected object, try to modify score threshold.'.
                        format(frame_id))
                    empty_detections = True
            else:
                outs = self.model.detector(data)
                outs['bbox'] = outs['bbox'].numpy()
                outs['bbox_num'] = outs['bbox_num'].numpy()

                if len(outs['bbox']) > 0 and not empty_detections:
                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes
                    pred_cls_ids = outs['bbox'][:, 0:1]
                    pred_scores = outs['bbox'][:, 1:2]
                    if not scaled:
                        # Note: scaled=False only in JDE YOLOv3 or other detectors
                        # with LetterBoxResize and JDEBBoxPostProcess.
                        #
                        # 'scaled' means whether the coords after detector outputs
                        # have been scaled back to the original image, set True
                        # in general detector, set False in JDE YOLOv3.
                        pred_bboxes = scale_coords(outs['bbox'][:, 2:],
                                                   input_shape, im_shape,
                                                   scale_factor)
                    else:
                        pred_bboxes = outs['bbox'][:, 2:]
                    pred_dets_old = np.concatenate(
                        (pred_cls_ids, pred_scores, pred_bboxes), axis=1)
                else:
                    logger.warning(
                        'Frame {} has no detected object, try to modify score threshold.'.
                        format(frame_id))
                    empty_detections = True

            if not empty_detections:
                pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape)
                if len(keep_idx[0]) == 0:
                    logger.warning(
                        'Frame {} has no detected object left after clip_box.'.
format(frame_id)) empty_detections = True if empty_detections: timer.toc() # if visualize, use original image instead online_ids, online_tlwhs, online_scores = None, None, None save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) frame_id += 1 # thus will not inference reid model continue pred_cls_ids = pred_cls_ids[keep_idx[0]] pred_scores = pred_scores[keep_idx[0]] pred_dets = np.concatenate( (pred_cls_ids, pred_scores, pred_xyxys), axis=1) if use_reid: crops = get_crops( pred_xyxys, ori_image, w=tracker.input_size[0], h=tracker.input_size[1]) crops = paddle.to_tensor(crops) data.update({'crops': crops}) pred_embs = self.model(data)['embeddings'].numpy() else: pred_embs = None if isinstance(tracker, DeepSORTTracker): online_tlwhs, online_scores, online_ids = [], [], [] tracker.predict() online_targets = tracker.update(pred_dets, pred_embs) for t in online_targets: if not t.is_confirmed() or t.time_since_update > 1: continue tlwh = t.to_tlwh() tscore = t.score tid = t.track_id if tscore < draw_threshold: continue if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ 3] > tracker.vertical_ratio: continue online_tlwhs.append(tlwh) online_scores.append(tscore) online_ids.append(tid) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) elif isinstance(tracker, JDETracker): # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams( seq_name, tracker.track_buffer, tracker.conf_thres) online_targets_dict = tracker.update(pred_dets_old, pred_embs) online_tlwhs = defaultdict(list) online_scores = defaultdict(list) online_ids = defaultdict(list) for cls_id in range(self.cfg.num_classes): online_targets = online_targets_dict[cls_id] for t in online_targets: tlwh = t.tlwh tid = t.track_id tscore = t.score if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ 3] > tracker.vertical_ratio: continue online_tlwhs[cls_id].append(tlwh) online_ids[cls_id].append(tid) online_scores[cls_id].append(tscore) # save results results[cls_id].append( (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], online_ids[cls_id])) timer.toc() save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) elif isinstance(tracker, OCSORTTracker): # OC_SORT Tracker online_targets = tracker.update(pred_dets_old, pred_embs) online_tlwhs = [] online_ids = [] online_scores = [] for t in online_targets: tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]] tscore = float(t[4]) tid = int(t[5]) if tlwh[2] * tlwh[3] > 0: online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(tscore) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) elif isinstance(tracker, BOTSORTTracker): # BOTSORT Tracker online_targets = tracker.update( pred_dets_old, img=ori_image.numpy()) online_tlwhs = [] online_ids = [] online_scores = [] for t in online_targets: tlwh = t.tlwh tid 
= t.track_id tscore = t.score if tlwh[2] * tlwh[3] > 0: online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(tscore) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) else: raise ValueError(tracker) frame_id += 1 return results, frame_id, timer.average_time, timer.calls def mot_evaluate(self, data_root, seqs, output_dir, data_type='mot', model_type='JDE', save_images=False, save_videos=False, show_image=False, scaled=False, det_results_dir=''): if not os.path.exists(output_dir): os.makedirs(output_dir) result_root = os.path.join(output_dir, 'mot_results') if not os.path.exists(result_root): os.makedirs(result_root) assert data_type in MOT_DATA_TYPE, \ "data_type should be 'mot', 'mcmot' or 'kitti'" assert model_type in MOT_ARCH, \ "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" # run tracking n_frame = 0 timer_avgs, timer_calls = [], [] for seq in seqs: infer_dir = os.path.join(data_root, seq) if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir): logger.warning("Seq {} error, {} has no images.".format( seq, infer_dir)) continue if os.path.exists(os.path.join(infer_dir, 'img1')): infer_dir = os.path.join(infer_dir, 'img1') frame_rate = 30 seqinfo = os.path.join(data_root, seq, 'seqinfo.ini') if os.path.exists(seqinfo): meta_info = open(seqinfo).read() frame_rate = int(meta_info[meta_info.find('frameRate') + 10: meta_info.find('\nseqLength')]) save_dir = os.path.join(output_dir, 'mot_outputs', seq) if save_images or save_videos else None logger.info('Evaluate seq: {}'.format(seq)) self.dataset.set_images(self.get_infer_images(infer_dir)) dataloader = create('EvalMOTReader')(self.dataset, 0) result_filename = os.path.join(result_root, '{}.txt'.format(seq)) with paddle.no_grad(): if model_type in MOT_ARCH_JDE: results, nf, ta, tc = self._eval_seq_jde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) elif model_type in MOT_ARCH_SDE: results, nf, ta, tc = self._eval_seq_sde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate, seq_name=seq, scaled=scaled, det_file=os.path.join(det_results_dir, '{}.txt'.format(seq))) elif model_type == 'CenterTrack': results, nf, ta, tc = self._eval_seq_centertrack( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) else: raise ValueError(model_type) write_mot_results(result_filename, results, data_type, self.cfg.num_classes) n_frame += nf timer_avgs.append(ta) timer_calls.append(tc) if save_videos: output_video_path = os.path.join(save_dir, '..', '{}_vis.mp4'.format(seq)) cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( save_dir, output_video_path) os.system(cmd_str) logger.info('Save video in {}.'.format(output_video_path)) # update metrics for metric in self._metrics: metric.update(data_root, seq, data_type, result_root, result_filename) timer_avgs = np.asarray(timer_avgs) timer_calls = np.asarray(timer_calls) all_time = np.dot(timer_avgs, timer_calls) avg_time = all_time / np.sum(timer_calls) logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format( all_time, 1.0 / avg_time)) # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() # reset metric states for metric may performed multiple times self._reset_metrics() def get_infer_images(self, infer_dir): assert infer_dir is None or 
os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) images = set() assert os.path.isdir(infer_dir), \ "infer_dir {} is not a directory".format(infer_dir) exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) images.sort() assert len(images) > 0, "no image found in {}".format(infer_dir) logger.info("Found {} inference images in total.".format(len(images))) return images def mot_predict_seq(self, video_file, frame_rate, image_dir, output_dir, data_type='mot', model_type='JDE', save_images=False, save_videos=True, show_image=False, scaled=False, det_results_dir='', draw_threshold=0.5): assert video_file is not None or image_dir is not None, \ "--video_file or --image_dir should be set." assert video_file is None or os.path.isfile(video_file), \ "{} is not a file".format(video_file) assert image_dir is None or os.path.isdir(image_dir), \ "{} is not a directory".format(image_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) result_root = os.path.join(output_dir, 'mot_results') if not os.path.exists(result_root): os.makedirs(result_root) assert data_type in MOT_DATA_TYPE, \ "data_type should be 'mot', 'mcmot' or 'kitti'" assert model_type in MOT_ARCH, \ "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" # run tracking if video_file: seq = video_file.split('/')[-1].split('.')[0] self.dataset.set_video(video_file, frame_rate) logger.info('Starting tracking video {}'.format(video_file)) elif image_dir: seq = image_dir.split('/')[-1].split('.')[0] if os.path.exists(os.path.join(image_dir, 'img1')): image_dir = os.path.join(image_dir, 'img1') images = [ '{}/{}'.format(image_dir, x) for x in os.listdir(image_dir) ] images.sort() self.dataset.set_images(images) logger.info('Starting tracking folder {}, found {} images'.format( image_dir, len(images))) else: raise ValueError('--video_file or --image_dir should be set.') save_dir = os.path.join(output_dir, 'mot_outputs', seq) if save_images or save_videos else None dataloader = create('TestMOTReader')(self.dataset, 0) result_filename = os.path.join(result_root, '{}.txt'.format(seq)) if frame_rate == -1: frame_rate = self.dataset.frame_rate with paddle.no_grad(): if model_type in MOT_ARCH_JDE: results, nf, ta, tc = self._eval_seq_jde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate, draw_threshold=draw_threshold) elif model_type in MOT_ARCH_SDE: results, nf, ta, tc = self._eval_seq_sde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate, seq_name=seq, scaled=scaled, det_file=os.path.join(det_results_dir, '{}.txt'.format(seq)), draw_threshold=draw_threshold) elif model_type == 'CenterTrack': results, nf, ta, tc = self._eval_seq_centertrack( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) else: raise ValueError(model_type) if save_videos: output_video_path = os.path.join(save_dir, '..', '{}_vis.mp4'.format(seq)) cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( save_dir, output_video_path) os.system(cmd_str) logger.info('Save video in {}'.format(output_video_path)) write_mot_results(result_filename, results, data_type, self.cfg.num_classes) def get_trick_hyperparams(video_name, ori_buffer, ori_thresh): if video_name[:3] != 'MOT': # only used for MOTChallenge (MOT17, MOT20) Test-set return ori_buffer, ori_thresh video_name = video_name[:8] if 'MOT17-05' in video_name: track_buffer = 14 elif 'MOT17-13' in 
video_name:
        track_buffer = 25
    else:
        track_buffer = ori_buffer

    if 'MOT17-01' in video_name:
        track_thresh = 0.65
    elif 'MOT17-06' in video_name:
        track_thresh = 0.65
    elif 'MOT17-12' in video_name:
        track_thresh = 0.7
    elif 'MOT17-14' in video_name:
        track_thresh = 0.67
    else:
        track_thresh = ori_thresh

    # MOT20-06 and MOT20-08 use a lower threshold; do not reset track_thresh
    # for the other sequences here, or the MOT17 settings above are lost.
    if 'MOT20-06' in video_name or 'MOT20-08' in video_name:
        track_thresh = 0.3

    return track_buffer, track_thresh


================================================
FILE: ppdet/engine/trainer.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import copy
import time
import yaml
from tqdm import tqdm

import numpy as np
import typing
from PIL import Image, ImageOps, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

import paddle
import paddle.nn as nn
import paddle.distributed as dist
from paddle.distributed import fleet
from paddle.static import InputSpec
from ppdet.optimizer import ModelEMA

from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, convert_to_dict
from ppdet.utils.visualizer import visualize_results, save_result
from ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval
from ppdet.metrics import Metric, COCOMetric, LVISMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric, CULaneMetric
from ppdet.data.source.sniper_coco import SniperCOCODataSet
from ppdet.data.source.category import get_categories
import ppdet.utils.stats as stats
from ppdet.utils.fuse_utils import fuse_conv_bn
from ppdet.utils import profiler
from ppdet.modeling.post_process import multiclass_nms
from ppdet.modeling.lane_utils import imshow_lanes

from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback, SemiCheckpointer, SemiLogPrinter
from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
from .naive_sync_bn import convert_syncbn

from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients

from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')

__all__ = ['Trainer']

MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']


class Trainer(object):
    def __init__(self, cfg, mode='train'):
        self.cfg = cfg.copy()
        assert mode.lower() in ['train', 'eval', 'test'], \
            "mode should be 'train', 'eval' or 'test'"
        self.mode = mode.lower()
        self.optimizer = None
        self.is_loaded_weights = False
        self.use_amp = self.cfg.get('amp', False)
        self.amp_level = self.cfg.get('amp_level', 'O1')
        self.custom_white_list = self.cfg.get('custom_white_list', None)
        self.custom_black_list = self.cfg.get('custom_black_list', None)
        self.use_master_grad =
self.cfg.get('master_grad', False) self.uniform_output_enabled = self.cfg.get('uniform_output_enabled', False) if ('slim' in cfg and cfg['slim_type'] == 'PTQ') or self.uniform_output_enabled: self.cfg['TestDataset'] = create('TestDataset')() log_ranks = cfg.get('log_ranks', '0') if isinstance(log_ranks, str): self.log_ranks = [int(i) for i in log_ranks.split(',')] elif isinstance(log_ranks, int): self.log_ranks = [log_ranks] train_results_path = os.path.abspath(os.path.join(self.cfg.save_dir, "train_result.json")) if self.uniform_output_enabled: if os.path.exists(train_results_path) and self.mode == 'train': try: os.remove(train_results_path) except: pass if not os.path.exists(self.cfg.save_dir): os.mkdir(self.cfg.save_dir) with open(os.path.join(self.cfg.save_dir, "config.yaml"), "w") as f: config_dict = convert_to_dict(self.cfg) config_dict = {k: v for k, v in config_dict.items() if v != {}} yaml.dump(config_dict, f) # build data loader capital_mode = self.mode.capitalize() if cfg.architecture in MOT_ARCH and self.mode in [ 'eval', 'test' ] and cfg.metric not in ['COCO', 'VOC']: self.dataset = self.cfg['{}MOTDataset'.format( capital_mode)] = create('{}MOTDataset'.format(capital_mode))() else: self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if cfg.architecture == 'DeepSORT' and self.mode == 'train': logger.error('DeepSORT has no need of training on mot dataset.') sys.exit(1) if cfg.architecture == 'FairMOT' and self.mode == 'eval': images = self.parse_mot_images(cfg) self.dataset.set_images(images) if self.mode == 'train': self.loader = create('{}Reader'.format(capital_mode))( self.dataset, cfg.worker_num) if cfg.architecture == 'JDE' and self.mode == 'train': self.cfg['JDEEmbeddingHead'][ 'num_identities'] = self.dataset.num_identities_dict[0] # JDE only support single class MOT now. if cfg.architecture == 'FairMOT' and self.mode == 'train': self.cfg['FairMOTEmbeddingHead'][ 'num_identities_dict'] = self.dataset.num_identities_dict # FairMOT support single class and multi-class MOT now. 
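        # For reference: `log_ranks` selects which distributed ranks print
        # training logs. It accepts either an int or a comma-separated
        # string (the values below are illustrative, not defaults):
        #
        #     log_ranks: 0      ->  self.log_ranks == [0]     only rank 0 logs
        #     log_ranks: '0,4'  ->  self.log_ranks == [0, 4]  one rank per
        #                                                     node logs
        #
        # train() later checks `self._local_rank in self.log_ranks` before
        # updating the smoothed training status.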
# build model if 'model' not in self.cfg: self.model = create(cfg.architecture) else: self.model = self.cfg.model self.is_loaded_weights = True if cfg.architecture == 'YOLOX': for k, m in self.model.named_sublayers(): if isinstance(m, nn.BatchNorm2D): m._epsilon = 1e-3 # for amp(fp16) m._momentum = 0.97 # 0.03 in pytorch # reset norm param attr for setting them in optimizer if 'reset_norm_param_attr' in cfg and cfg['reset_norm_param_attr']: self.model = self.reset_norm_param_attr( self.model, weight_attr=None, bias_attr=None) # normalize params for deploy if 'slim' in cfg and cfg['slim_type'] == 'OFA': self.model.model.load_meanstd(cfg['TestReader'][ 'sample_transforms']) elif 'slim' in cfg and cfg['slim_type'] == 'Distill': self.model.student_model.load_meanstd(cfg['TestReader'][ 'sample_transforms']) elif 'slim' in cfg and cfg[ 'slim_type'] == 'DistillPrune' and self.mode == 'train': self.model.student_model.load_meanstd(cfg['TestReader'][ 'sample_transforms']) else: self.model.load_meanstd(cfg['TestReader']['sample_transforms']) # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': if cfg.architecture == 'FairMOT': self.loader = create('EvalMOTReader')(self.dataset, 0) elif cfg.architecture == "METRO_Body": reader_name = '{}Reader'.format(self.mode.capitalize()) self.loader = create(reader_name)(self.dataset, cfg.worker_num) else: self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) reader_name = '{}Reader'.format(self.mode.capitalize()) # If metric is VOC, need to be set collate_batch=False. if cfg.metric == 'VOC': self.cfg[reader_name]['collate_batch'] = False self.loader = create(reader_name)(self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # get Params print_params = self.cfg.get('print_params', False) if print_params: params = sum([ p.numel() for n, p in self.model.named_parameters() if all([x not in n for x in ['_mean', '_variance', 'aux_']]) ]) # exclude BatchNorm running status logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[ 0])) # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) if steps_per_epoch < 1: logger.warning( "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) # Unstructured pruner is only enabled in the train mode. 
if self.cfg.get('unstructured_prune'): self.pruner = create('UnstructuredPruner')(self.model, steps_per_epoch) if self.use_amp and self.amp_level == 'O2': paddle_version = paddle.__version__[:3] # paddle version >= 2.5.0 or develop if paddle_version in ["2.5", "0.0"]: self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level, master_grad=self.use_master_grad) else: self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level) # support sync_bn for npu/xpu if (paddle.get_device()[:3]=='npu' or paddle.get_device()[:3]=='xpu'): use_npu = ('use_npu' in cfg and cfg['use_npu']) use_xpu = ('use_xpu' in cfg and cfg['use_xpu']) use_mlu = ('use_mlu' in cfg and cfg['use_mlu']) norm_type = ('norm_type' in cfg and cfg['norm_type']) if norm_type == 'sync_bn' and (use_npu or use_xpu or use_mlu) and dist.get_world_size() > 1: convert_syncbn(self.model) self.use_ema = ('use_ema' in cfg and cfg['use_ema']) if self.use_ema: ema_decay = self.cfg.get('ema_decay', 0.9998) ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') cycle_epoch = self.cfg.get('cycle_epoch', -1) ema_black_list = self.cfg.get('ema_black_list', None) ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False) self.ema = ModelEMA( self.model, decay=ema_decay, ema_decay_type=ema_decay_type, cycle_epoch=cycle_epoch, ema_black_list=ema_black_list, ema_filter_no_grad=ema_filter_no_grad) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} self.start_epoch = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def _init_callbacks(self): if self.mode == 'train': if self.cfg.get('ssod_method', False) and self.cfg['ssod_method'] == 'Semi_RTDETR': self._callbacks = [SemiLogPrinter(self), SemiCheckpointer(self)] else: self._callbacks = [LogPrinter(self), Checkpointer(self)] if self.cfg.get('use_vdl', False): self._callbacks.append(VisualDLWriter(self)) if self.cfg.get('save_proposals', False): self._callbacks.append(SniperProposalsGenerator(self)) if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: self._callbacks.append(WandbCallback(self)) self._compose_callback = ComposeCallback(self._callbacks) elif self.mode == 'eval': self._callbacks = [LogPrinter(self)] # if self.cfg.metric == 'WiderFace': # self._callbacks.append(WiferFaceEval(self)) self._compose_callback = ComposeCallback(self._callbacks) elif self.mode == 'test' and self.cfg.get('use_vdl', False): self._callbacks = [VisualDLWriter(self)] self._compose_callback = ComposeCallback(self._callbacks) else: self._callbacks = [] self._compose_callback = None def _init_metrics(self, validate=False): if self.mode == 'test' or (self.mode == 'train' and not validate): self._metrics = [] return classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO" or self.cfg.metric == 'LVIS': # TODO: bias should be unified bias = 1 if self.cfg.get('bias', False) else 0 output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) # pass clsid2catid info to metric instance to avoid multiple loading # annotation file clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ if self.mode == 'eval' else None save_threshold = self.cfg.get('save_threshold', 0) # 
when do validation in train, annotation file should be get from # EvalReader instead of self.dataset(which is TrainReader) if self.mode == 'train' and validate: eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() dataset = eval_dataset else: dataset = self.dataset anno_file = dataset.get_anno() IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' if self.cfg.metric == "COCO": self._metrics = [ COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, IouType=IouType, save_prediction_only=save_prediction_only, save_threshold=save_threshold) ] elif self.cfg.metric == "LVIS": self._metrics = [ LVISMetric( anno_file=anno_file, clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, IouType=IouType, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == "SNIPERCOCO": # sniper self._metrics = [ SNIPERCOCOMetric( anno_file=anno_file, dataset=dataset, clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, IouType=IouType, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'RBOX': # TODO: bias should be unified bias = self.cfg['bias'] if 'bias' in self.cfg else 0 output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) imid2path = self.cfg.get('imid2path', None) # when do validation in train, annotation file should be get from # EvalReader instead of self.dataset(which is TrainReader) anno_file = self.dataset.get_anno() if self.mode == 'train' and validate: eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() self._metrics = [ RBoxMetric( anno_file=anno_file, classwise=classwise, output_eval=output_eval, bias=bias, save_prediction_only=save_prediction_only, imid2path=imid2path) ] elif self.cfg.metric == 'VOC': output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ VOCMetric( label_list=self.dataset.get_label_list(), class_num=self.cfg.num_classes, map_type=self.cfg.map_type, classwise=classwise, output_eval=output_eval, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'WiderFace': self._metrics = [ WiderFaceMetric() ] elif self.cfg.metric == 'KeyPointTopDownCOCOEval': eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ KeyPointTopDownCOCOEval( anno_file, len(eval_dataset), self.cfg.num_joints, self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval': eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ KeyPointTopDownCOCOWholeBadyHandEval( anno_file, len(eval_dataset), self.cfg.num_joints, self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'KeyPointTopDownMPIIEval': eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ KeyPointTopDownMPIIEval( anno_file, 
len(eval_dataset), self.cfg.num_joints, self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'Pose3DEval': save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ Pose3DEval( self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'MOTDet': self._metrics = [JDEDetMetric(), ] elif self.cfg.metric == 'CULaneMetric': output_eval = self.cfg.get('output_eval', None) self._metrics = [ CULaneMetric( cfg=self.cfg, output_eval=output_eval, split=self.dataset.split, dataset_dir=self.cfg.dataset_dir) ] else: logger.warning("Metric not support for metric type {}".format( self.cfg.metric)) self._metrics = [] def _reset_metrics(self): for metric in self._metrics: metric.reset() def register_callbacks(self, callbacks): callbacks = [c for c in list(callbacks) if c is not None] for c in callbacks: assert isinstance(c, Callback), \ "metrics shoule be instances of subclass of Metric" self._callbacks.extend(callbacks) self._compose_callback = ComposeCallback(self._callbacks) def register_metrics(self, metrics): metrics = [m for m in list(metrics) if m is not None] for m in metrics: assert isinstance(m, Metric), \ "metrics shoule be instances of subclass of Metric" self._metrics.extend(metrics) def load_weights(self, weights, ARSL_eval=False): if self.is_loaded_weights: return self.start_epoch = 0 load_pretrain_weight(self.model, weights, ARSL_eval) logger.debug("Load weights {} to start training".format(weights)) def load_weights_sde(self, det_weights, reid_weights): if self.model.detector: load_weight(self.model.detector, det_weights) if self.model.reid: load_weight(self.model.reid, reid_weights) else: load_weight(self.model.reid, reid_weights) def resume_weights(self, weights): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer) else: self.start_epoch = load_weight(self.model, weights, self.optimizer, self.ema if self.use_ema else None) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False if validate: self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( "EvalDataset")() model = self.model if self.cfg.get('to_static', False): model = apply_to_static(self.cfg, model) sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) # enabel auto mixed precision mode if self.use_amp: scaler = paddle.amp.GradScaler( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) # get distributed model if self.cfg.get('fleet', False): model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False model = paddle.DataParallel( model, find_unused_parameters=find_unused_parameters) self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, 'steps_per_epoch': len(self.loader) }) self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) if self.cfg.get('print_flops', False): 
flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num) self._flops(flops_loader) profiler_options = self.cfg.get('profiler_options', None) self._compose_callback.on_train_begin(self.status) use_fused_allreduce_gradients = self.cfg[ 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset.set_epoch(epoch_id) model.train() iter_tic = time.time() for step_id, data in enumerate(self.loader): def deep_pin(blob, blocking): if isinstance(blob, paddle.Tensor): return blob.cuda(blocking=blocking) elif isinstance(blob, dict): return {k: deep_pin(v, blocking) for k, v in blob.items()} elif isinstance(blob, (list, tuple)): return type(blob)([deep_pin(x, blocking) for x in blob]) else: return blob # if paddle.base.core.is_compiled_with_cuda(): # data = deep_pin(data, False) self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) data['epoch_id'] = epoch_id if self.cfg.get('to_static', False) and 'image_file' in data.keys(): data.pop('image_file') if self.use_amp: if isinstance( model, paddle. DataParallel) and use_fused_allreduce_gradients: with model.no_sync(): with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): # model forward outputs = model(data) loss = outputs['loss'] # model backward scaled_loss = scaler.scale(loss) scaled_loss.backward() fused_allreduce_gradients( list(model.parameters()), None) else: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): # model forward outputs = model(data) loss = outputs['loss'] # model backward scaled_loss = scaler.scale(loss) scaled_loss.backward() # in dygraph mode, optimizer.minimize is equal to optimizer.step scaler.minimize(self.optimizer, scaled_loss) else: if isinstance( model, paddle. 
DataParallel) and use_fused_allreduce_gradients: with model.no_sync(): # model forward outputs = model(data) loss = outputs['loss'] # model backward loss.backward() fused_allreduce_gradients( list(model.parameters()), None) else: # model forward outputs = model(data) loss = outputs['loss'] # model backward loss.backward() self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() if self.cfg.get('unstructured_prune'): self.pruner.step() self.optimizer.clear_grad() self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank in self.log_ranks: self.status['training_staus'].update(outputs) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) if self.use_ema: self.ema.update() iter_tic = time.time() if self.cfg.get('unstructured_prune'): self.pruner.update_params() is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \ and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) if is_snapshot and self.use_ema: # apply ema weight on model weight = copy.deepcopy(self.model.state_dict()) self.model.set_dict(self.ema.apply()) self.status['weight'] = weight self._compose_callback.on_epoch_end(self.status) if validate and is_snapshot: if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. if self.cfg.metric == 'VOC': self.cfg['EvalReader']['collate_batch'] = False if self.cfg.metric == "Pose3DEval": self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num) else: self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num, batch_sampler=self._eval_batch_sampler) # if validation in training is enabled, metrics should be re-init # Init_mark makes sure this code will only execute once if validate and Init_mark == False: Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() with paddle.no_grad(): self.status['save_best_model'] = True self._eval_with_loader(self._eval_loader) if is_snapshot and self.use_ema: # reset original weight self.model.set_dict(weight) self.status.pop('weight') self._compose_callback.on_train_end(self.status) def _eval_with_loader(self, loader): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num, self._eval_batch_sampler) self._flops(flops_loader) for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward if self.use_amp: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): outs = self.model(data) else: outs = self.model(data) # update metrics for metric in self._metrics: metric.update(data, outs) # multi-scale inputs: all inputs have same im_id if isinstance(data, typing.Sequence): sample_num += data[0]['im_id'].numpy().shape[0] else: sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = 
time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) # reset metric states for metric may performed multiple times self._reset_metrics() def evaluate(self): # get distributed model if self.cfg.get('fleet', False): self.model = fleet.distributed_model(self.model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model = paddle.DataParallel( self.model, find_unused_parameters=find_unused_parameters) with paddle.no_grad(): self._eval_with_loader(self.loader) def _eval_with_loader_slice(self, loader, slice_size=[640, 640], overlap_ratio=[0.25, 0.25], combine_method='nms', match_threshold=0.6, match_metric='iou'): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num, self._eval_batch_sampler) self._flops(flops_loader) merged_bboxs = [] for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward if self.use_amp: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): outs = self.model(data) else: outs = self.model(data) shift_amount = data['st_pix'] outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount merged_bboxs.append(outs['bbox']) if data['is_last'] > 0: # merge matching predictions merged_results = {'bbox': []} if combine_method == 'nms': final_boxes = multiclass_nms( np.concatenate(merged_bboxs), self.cfg.num_classes, match_threshold, match_metric) merged_results['bbox'] = np.concatenate(final_boxes) elif combine_method == 'concat': merged_results['bbox'] = np.concatenate(merged_bboxs) else: raise ValueError( "Now only support 'nms' or 'concat' to fuse detection results." 
) merged_results['im_id'] = np.array([[0]]) merged_results['bbox_num'] = np.array( [len(merged_results['bbox'])]) merged_bboxs = [] data['im_id'] = data['ori_im_id'] # update metrics for metric in self._metrics: metric.update(data, merged_results) # multi-scale inputs: all inputs have same im_id if isinstance(data, typing.Sequence): sample_num += data[0]['im_id'].numpy().shape[0] else: sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) # reset metric states for metric may performed multiple times self._reset_metrics() def evaluate_slice(self, slice_size=[640, 640], overlap_ratio=[0.25, 0.25], combine_method='nms', match_threshold=0.6, match_metric='iou'): with paddle.no_grad(): self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, combine_method, match_threshold, match_metric) def slice_predict(self, images, slice_size=[640, 640], overlap_ratio=[0.25, 0.25], combine_method='nms', match_threshold=0.6, match_metric='iou', draw_threshold=0.5, output_dir='output', save_results=False, visualize=True): if not os.path.exists(output_dir): os.makedirs(output_dir) self.dataset.set_slice_images(images, slice_size, overlap_ratio) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() def setup_metrics_for_loader(): # mem metrics = copy.deepcopy(self._metrics) mode = self.mode save_prediction_only = self.cfg[ 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None output_eval = self.cfg[ 'output_eval'] if 'output_eval' in self.cfg else None # modify self.mode = '_test' self.cfg['save_prediction_only'] = True self.cfg['output_eval'] = output_dir self.cfg['imid2path'] = imid2path self._init_metrics() # restore self.mode = mode self.cfg.pop('save_prediction_only') if save_prediction_only is not None: self.cfg['save_prediction_only'] = save_prediction_only self.cfg.pop('output_eval') if output_eval is not None: self.cfg['output_eval'] = output_eval self.cfg.pop('imid2path') _metrics = copy.deepcopy(self._metrics) self._metrics = metrics return _metrics if save_results: metrics = setup_metrics_for_loader() else: metrics = [] anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] # all images merged_bboxs = [] # single image for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward outs = self.model(data) outs['bbox'] = outs['bbox'].numpy() # only in test mode shift_amount = data['st_pix'] outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() merged_bboxs.append(outs['bbox']) if data['is_last'] > 0: # merge matching predictions merged_results = {'bbox': []} if combine_method == 'nms': final_boxes = multiclass_nms( np.concatenate(merged_bboxs), self.cfg.num_classes, match_threshold, match_metric) merged_results['bbox'] = np.concatenate(final_boxes) elif combine_method == 'concat': merged_results['bbox'] = np.concatenate(merged_bboxs) else: raise ValueError( "Now only support 'nms' or 'concat' to fuse detection results." 
) merged_results['im_id'] = np.array([[0]]) merged_results['bbox_num'] = np.array( [len(merged_results['bbox'])]) merged_bboxs = [] data['im_id'] = data['ori_im_id'] for _m in metrics: _m.update(data, merged_results) for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): merged_results[key] = data[0][key] else: merged_results[key] = data[key] for key, value in merged_results.items(): if hasattr(value, 'numpy'): merged_results[key] = value.numpy() results.append(merged_results) for _m in metrics: _m.accumulate() _m.reset() if visualize: for outs in results: batch_res = get_infer_results(outs, clsid2catid) bbox_num = outs['bbox_num'] start = 0 for i, im_id in enumerate(outs['im_id']): image_path = imid2path[int(im_id)] image = Image.open(image_path).convert('RGB') image = ImageOps.exif_transpose(image) self.status['original_image'] = np.array(image.copy()) end = start + bbox_num[i] bbox_res = batch_res['bbox'][start:end] \ if 'bbox' in batch_res else None mask_res = batch_res['mask'][start:end] \ if 'mask' in batch_res else None segm_res = batch_res['segm'][start:end] \ if 'segm' in batch_res else None keypoint_res = batch_res['keypoint'][start:end] \ if 'keypoint' in batch_res else None pose3d_res = batch_res['pose3d'][start:end] \ if 'pose3d' in batch_res else None image = visualize_results( image, bbox_res, mask_res, segm_res, keypoint_res, pose3d_res, int(im_id), catid2name, draw_threshold) self.status['result_image'] = np.array(image.copy()) if self._compose_callback: self._compose_callback.on_step_end(self.status) # save image with detection save_name = self._get_save_image_name(output_dir, image_path) logger.info("Detection bbox results save in {}".format( save_name)) image.save(save_name, quality=95) start = end def predict(self, images, draw_threshold=0.5, output_dir='output', save_results=False, visualize=True, save_threshold=0, do_eval=False): if not os.path.exists(output_dir): os.makedirs(output_dir) if do_eval: save_threshold = 0.0 self.dataset.set_images(images, do_eval=do_eval) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() def setup_metrics_for_loader(): # mem metrics = copy.deepcopy(self._metrics) mode = self.mode save_prediction_only = self.cfg[ 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None output_eval = self.cfg[ 'output_eval'] if 'output_eval' in self.cfg else None # modify self.mode = '_test' self.cfg['save_prediction_only'] = True self.cfg['output_eval'] = output_dir self.cfg['imid2path'] = imid2path self.cfg['save_threshold'] = save_threshold self._init_metrics() # restore self.mode = mode self.cfg.pop('save_prediction_only') if save_prediction_only is not None: self.cfg['save_prediction_only'] = save_prediction_only self.cfg.pop('output_eval') if output_eval is not None: self.cfg['output_eval'] = output_eval self.cfg.pop('imid2path') _metrics = copy.deepcopy(self._metrics) self._metrics = metrics return _metrics if save_results: metrics = setup_metrics_for_loader() else: metrics = [] anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward if hasattr(self.model, 'modelTeacher'): outs = self.model.modelTeacher(data) else: outs = 
self.model(data) for _m in metrics: _m.update(data, outs) for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): outs[key] = data[0][key] else: outs[key] = data[key] for key, value in outs.items(): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) # sniper if type(self.dataset) == SniperCOCODataSet: results = self.dataset.anno_cropper.aggregate_chips_detections( results) for _m in metrics: _m.accumulate() _m.reset() if visualize: for outs in results: batch_res = get_infer_results(outs, clsid2catid) bbox_num = outs['bbox_num'] start = 0 for i, im_id in enumerate(outs['im_id']): image_path = imid2path[int(im_id)] image = Image.open(image_path).convert('RGB') image = ImageOps.exif_transpose(image) self.status['original_image'] = np.array(image.copy()) end = start + bbox_num[i] bbox_res = batch_res['bbox'][start:end] \ if 'bbox' in batch_res else None mask_res = batch_res['mask'][start:end] \ if 'mask' in batch_res else None segm_res = batch_res['segm'][start:end] \ if 'segm' in batch_res else None keypoint_res = batch_res['keypoint'][start:end] \ if 'keypoint' in batch_res else None pose3d_res = batch_res['pose3d'][start:end] \ if 'pose3d' in batch_res else None image = visualize_results( image, bbox_res, mask_res, segm_res, keypoint_res, pose3d_res, int(im_id), catid2name, draw_threshold) self.status['result_image'] = np.array(image.copy()) if self._compose_callback: self._compose_callback.on_step_end(self.status) # save image with detection save_name = self._get_save_image_name(output_dir, image_path) logger.info("Detection bbox results save in {}".format( save_name)) image.save(save_name, quality=95) start = end return results def _get_save_image_name(self, output_dir, image_path): """ Get save image name from source image path. 
""" image_name = os.path.split(image_path)[-1] name, ext = os.path.splitext(image_name) return os.path.join(output_dir, "{}".format(name)) + ext def _get_infer_cfg_and_input_spec(self, save_dir, prune_input=True, kl_quant=False, yaml_name=None, model=None): if yaml_name is None: yaml_name = 'infer_cfg.yml' if model is None: model = self.model image_shape = None im_shape = [None, 2] scale_factor = [None, 2] if self.cfg.architecture in MOT_ARCH: test_reader_name = 'TestMOTReader' else: test_reader_name = 'TestReader' if 'inputs_def' in self.cfg[test_reader_name]: inputs_def = self.cfg[test_reader_name]['inputs_def'] image_shape = inputs_def.get('image_shape', None) # set image_shape=[None, 3, -1, -1] as default if image_shape is None: image_shape = [None, 3, -1, -1] if len(image_shape) == 3: image_shape = [None] + image_shape else: im_shape = [image_shape[0], 2] scale_factor = [image_shape[0], 2] if hasattr(model, 'deploy'): model.deploy = True if 'slim' not in self.cfg: for layer in model.sublayers(): if hasattr(layer, 'convert_to_deploy'): layer.convert_to_deploy() if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ 'export'] and self.cfg['export']['fuse_conv_bn']: model = fuse_conv_bn(model) export_post_process = self.cfg['export'].get( 'post_process', False) if hasattr(self.cfg, 'export') else True export_nms = self.cfg['export'].get('nms', False) if hasattr( self.cfg, 'export') else True export_benchmark = self.cfg['export'].get( 'benchmark', False) if hasattr(self.cfg, 'export') else False if hasattr(model, 'fuse_norm'): model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize', False) if hasattr(model, 'export_post_process'): model.export_post_process = export_post_process if not export_benchmark else False if hasattr(model, 'export_nms'): model.export_nms = export_nms if not export_benchmark else False if export_post_process and not export_benchmark: image_shape = [None] + image_shape[1:] # Save infer cfg _dump_infer_config(self.cfg, os.path.join(save_dir, yaml_name), image_shape, model) input_spec = [{ "image": InputSpec( shape=image_shape, name='image'), "im_shape": InputSpec( shape=im_shape, name='im_shape'), "scale_factor": InputSpec( shape=scale_factor, name='scale_factor') }] if self.cfg.architecture == 'DeepSORT': input_spec[0].update({ "crops": InputSpec( shape=[None, 3, 192, 64], name='crops') }) if self.cfg.architecture == 'CLRNet': input_spec[0].update({ "full_img_path": str, "img_name": str, }) if prune_input: static_model = paddle.jit.to_static( model, input_spec=input_spec, full_graph=True) # NOTE: dy2st do not pruned program, but jit.save will prune program # input spec, prune input spec here and save with pruned input spec pruned_input_spec = _prune_input_spec( input_spec, static_model.forward.main_program, static_model.forward.outputs) else: static_model = None pruned_input_spec = input_spec # TODO: Hard code, delete it when support prune input_spec. 
if self.cfg.architecture == 'PicoDet' and not export_post_process: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image') }] if kl_quant: if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image'), "scale_factor": InputSpec( shape=scale_factor, name='scale_factor') }] elif 'tinypose' in self.cfg.weights: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image') }] return static_model, pruned_input_spec def export(self, output_dir='output_inference', for_fd=False): if hasattr(self.model, 'aux_neck'): self.model.__delattr__('aux_neck') if hasattr(self.model, 'aux_head'): self.model.__delattr__('aux_head') self.model.eval() model = copy.deepcopy(self.model) model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] if for_fd: save_dir = output_dir save_name = 'inference' yaml_name = 'inference.yml' else: save_dir = os.path.join(output_dir, model_name) save_name = 'model' yaml_name = None if not os.path.exists(save_dir): os.makedirs(save_dir) static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec( save_dir, yaml_name=yaml_name, model=model) # dy2st and save model if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']: paddle.jit.save( static_model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) else: self.cfg.slim.save_quantized_model( self.model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) logger.info("Export model and saved in {}".format(save_dir)) def post_quant(self, output_dir='output_inference'): model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] save_dir = os.path.join(output_dir, model_name) if not os.path.exists(save_dir): os.makedirs(save_dir) for idx, data in enumerate(self.loader): self.model(data) if idx == int(self.cfg.get('quant_batch_num', 10)): break # TODO: support prune input_spec kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False _, pruned_input_spec = self._get_infer_cfg_and_input_spec( save_dir, prune_input=False, kl_quant=kl_quant) self.cfg.slim.save_quantized_model( self.model, os.path.join(save_dir, 'model'), input_spec=pruned_input_spec) logger.info("Export Post-Quant model and saved in {}".format(save_dir)) def _flops(self, loader): if hasattr(self.model, 'aux_neck'): self.model.__delattr__('aux_neck') if hasattr(self.model, 'aux_head'): self.model.__delattr__('aux_head') self.model.eval() try: import paddleslim except Exception as e: logger.warning( 'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`' ) return from paddleslim.analysis import dygraph_flops as flops input_data = None for data in loader: input_data = data break input_spec = [{ "image": input_data['image'][0].unsqueeze(0), "im_shape": input_data['im_shape'][0].unsqueeze(0), "scale_factor": input_data['scale_factor'][0].unsqueeze(0) }] flops = flops(self.model, input_spec) / (1000**3) logger.info(" Model FLOPs : {:.6f}G. 
(image shape is {})".format( flops, input_data['image'][0].unsqueeze(0).shape)) def parse_mot_images(self, cfg): import glob # for quant dataset_dir = cfg['EvalMOTDataset'].dataset_dir data_root = cfg['EvalMOTDataset'].data_root data_root = '{}/{}'.format(dataset_dir, data_root) seqs = os.listdir(data_root) seqs.sort() all_images = [] for seq in seqs: infer_dir = os.path.join(data_root, seq) assert infer_dir is None or os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) images = set() exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) images.sort() assert len(images) > 0, "no image found in {}".format(infer_dir) all_images.extend(images) logger.info("Found {} inference images in total.".format( len(images))) return all_images def predict_culane(self, images, output_dir='output', save_results=False, visualize=True): if not os.path.exists(output_dir): os.makedirs(output_dir) self.dataset.set_images(images) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() def setup_metrics_for_loader(): # mem metrics = copy.deepcopy(self._metrics) mode = self.mode save_prediction_only = self.cfg[ 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None output_eval = self.cfg[ 'output_eval'] if 'output_eval' in self.cfg else None # modify self.mode = '_test' self.cfg['save_prediction_only'] = True self.cfg['output_eval'] = output_dir self.cfg['imid2path'] = imid2path self._init_metrics() # restore self.mode = mode self.cfg.pop('save_prediction_only') if save_prediction_only is not None: self.cfg['save_prediction_only'] = save_prediction_only self.cfg.pop('output_eval') if output_eval is not None: self.cfg['output_eval'] = output_eval self.cfg.pop('imid2path') _metrics = copy.deepcopy(self._metrics) self._metrics = metrics return _metrics if save_results: metrics = setup_metrics_for_loader() else: metrics = [] # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward outs = self.model(data) for _m in metrics: _m.update(data, outs) for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): outs[key] = data[0][key] else: outs[key] = data[key] for key, value in outs.items(): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) for _m in metrics: _m.accumulate() _m.reset() if visualize: import cv2 for outs in results: for i in range(len(outs['img_path'])): lanes = outs['lanes'][i] img_path = outs['img_path'][i] img = cv2.imread(img_path) out_file = os.path.join(output_dir, os.path.basename(img_path)) lanes = [ lane.to_array( sample_y_range=[ self.cfg['sample_y']['start'], self.cfg['sample_y']['end'], self.cfg['sample_y']['step'] ], img_w=self.cfg.ori_img_w, img_h=self.cfg.ori_img_h) for lane in lanes ] imshow_lanes(img, lanes, out_file=out_file) return results def reset_norm_param_attr(self, layer, **kwargs): if isinstance(layer, (nn.BatchNorm2D, nn.LayerNorm, nn.GroupNorm)): src_state_dict = layer.state_dict() if isinstance(layer, nn.BatchNorm2D): layer = nn.BatchNorm2D( num_features=layer._num_features, momentum=layer._momentum, epsilon=layer._epsilon, **kwargs) elif isinstance(layer, nn.LayerNorm): layer = nn.LayerNorm( 
normalized_shape=layer._normalized_shape, epsilon=layer._epsilon, **kwargs) else: layer = nn.GroupNorm( num_groups=layer._num_groups, num_channels=layer._num_channels, epsilon=layer._epsilon, **kwargs) layer.set_state_dict(src_state_dict) else: for name, sublayer in layer.named_children(): new_sublayer = self.reset_norm_param_attr(sublayer, **kwargs) if new_sublayer is not sublayer: setattr(layer, name, new_sublayer) return layer ================================================ FILE: ppdet/engine/trainer_cot.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ppdet.core.workspace import create from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') from . import Trainer __all__ = ['TrainerCot'] class TrainerCot(Trainer): """ Trainer for label co-tuning; computes the relationship between base_classes and novel_classes """ def __init__(self, cfg, mode='train'): super(TrainerCot, self).__init__(cfg, mode) self.cotuning_init() def cotuning_init(self): num_classes_novel = self.cfg['num_classes'] self.load_weights(self.cfg.pretrain_weights) self.model.eval() relationship = self.model.relationship_learning(self.loader, num_classes_novel) self.model.init_cot_head(relationship) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) ================================================ FILE: ppdet/engine/trainer_ssod.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
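# ---------------------------------------------------------------------------
# [Editor's usage sketch] `reset_norm_param_attr`, defined at the end of
# trainer.py above, rebuilds BatchNorm2D/LayerNorm/GroupNorm layers with new
# ParamAttr kwargs while restoring their trained state. One plausible use is
# disabling weight decay on norm parameters; `trainer` below is assumed to be
# an already-constructed Trainer with a built model:
import paddle

no_decay = paddle.ParamAttr(regularizer=paddle.regularizer.L2Decay(0.0))
trainer.model = trainer.reset_norm_param_attr(
    trainer.model, weight_attr=no_decay, bias_attr=no_decay)
# ---------------------------------------------------------------------------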
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import copy import time import typing import numpy as np import paddle import paddle.nn as nn import paddle.distributed as dist from paddle.distributed import fleet from ppdet.optimizer import ModelEMA, SimpleModelEMA from ppdet.core.workspace import create from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model import ppdet.utils.stats as stats from ppdet.utils import profiler from ppdet.modeling.ssod.utils import align_weak_strong_shape from .trainer import Trainer from ppdet.utils.logger import setup_logger from paddle.static import InputSpec from ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] logger = setup_logger('ppdet.engine') __all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL', 'Trainer_Semi_RTDETR'] class Trainer_DenseTeacher(Trainer): def __init__(self, cfg, mode='train'): self.cfg = cfg assert mode.lower() in ['train', 'eval', 'test'], \ "mode should be 'train', 'eval' or 'test'" self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False self.use_amp = self.cfg.get('amp', False) self.amp_level = self.cfg.get('amp_level', 'O1') self.custom_white_list = self.cfg.get('custom_white_list', None) self.custom_black_list = self.cfg.get('custom_black_list', None) # build data loader capital_mode = self.mode.capitalize() self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if self.mode == 'train': self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( 'UnsupTrainDataset') self.loader = create('SemiTrainReader')( self.dataset, self.dataset_unlabel, cfg.worker_num) # build model if 'model' not in self.cfg: self.model = create(cfg.architecture) else: self.model = self.cfg.model self.is_loaded_weights = True # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. if cfg.metric == 'VOC': cfg['EvalReader']['collate_batch'] = False self.loader = create('EvalReader')(self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) if steps_per_epoch < 1: logger.warning( "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) # Unstructured pruner is only enabled in the train mode. 
if self.cfg.get('unstructured_prune'): self.pruner = create('UnstructuredPruner')(self.model, steps_per_epoch) if self.use_amp and self.amp_level == 'O2': self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level) self.use_ema = ('use_ema' in cfg and cfg['use_ema']) if self.use_ema: ema_decay = self.cfg.get('ema_decay', 0.9998) ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') cycle_epoch = self.cfg.get('cycle_epoch', -1) ema_black_list = self.cfg.get('ema_black_list', None) self.ema = ModelEMA( self.model, decay=ema_decay, ema_decay_type=ema_decay_type, cycle_epoch=cycle_epoch, ema_black_list=ema_black_list) self.ema_start_iters = self.cfg.get('ema_start_iters', 0) # simple_ema for SSOD self.use_simple_ema = ('use_simple_ema' in cfg and cfg['use_simple_ema']) if self.use_simple_ema: self.use_ema = True ema_decay = self.cfg.get('ema_decay', 0.9996) self.ema = SimpleModelEMA(self.model, decay=ema_decay) self.ema_start_iters = self.cfg.get('ema_start_iters', 0) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} self.start_epoch = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def load_weights(self, weights): if self.is_loaded_weights: return self.start_epoch = 0 load_pretrain_weight(self.model, weights) load_pretrain_weight(self.ema.model, weights) logger.info("Load weights {} to start training for teacher and student". format(weights)) def resume_weights(self, weights, exchange=True): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer, exchange) else: self.start_epoch = load_weight(self.model, weights, self.optimizer, self.ema if self.use_ema else None, exchange) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): self.semi_start_iters = self.cfg.get('semi_start_iters', 5000) Init_mark = False if validate: self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( "EvalDataset")() sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( self.model) if self.cfg.get('fleet', False): self.model = fleet.distributed_model(self.model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model = paddle.DataParallel( self.model, find_unused_parameters=find_unused_parameters) self.ema.model = paddle.DataParallel( self.ema.model, find_unused_parameters=find_unused_parameters) self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, 'steps_per_epoch': len(self.loader), 'exchange_save_model': True, }) # Note: exchange_save_model # in DenseTeacher SSOD, the teacher model will be higher, so exchange when saving pdparams self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) profiler_options = self.cfg.get('profiler_options', None) self._compose_callback.on_train_begin(self.status) train_cfg = self.cfg.DenseTeacher['train_cfg'] concat_sup_data = 
train_cfg.get('concat_sup_data', True) for param in self.ema.model.parameters(): param.stop_gradient = True for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset_label.set_epoch(epoch_id) self.loader.dataset_unlabel.set_epoch(epoch_id) iter_tic = time.time() loss_dict = { 'loss': paddle.to_tensor([0]), 'loss_sup_sum': paddle.to_tensor([0]), 'loss_unsup_sum': paddle.to_tensor([0]), 'fg_sum': paddle.to_tensor([0]), } if self._nranks > 1: for k in self.model._layers.get_loss_keys(): loss_dict.update({k: paddle.to_tensor([0.])}) for k in self.model._layers.get_loss_keys(): loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) else: for k in self.model.get_loss_keys(): loss_dict.update({k: paddle.to_tensor([0.])}) for k in self.model.get_loss_keys(): loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) # Note: for step_id, data in enumerate(self.loader): # enumerate bug for step_id in range(len(self.loader)): data = next(self.loader) self.model.train() self.ema.model.eval() data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) if data_sup_w['image'].shape != data_sup_s['image'].shape: data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, data_sup_s) data_sup_w['epoch_id'] = epoch_id data_sup_s['epoch_id'] = epoch_id if concat_sup_data: for k, v in data_sup_s.items(): if k in ['epoch_id']: continue data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) loss_dict_sup = self.model(data_sup_s) else: loss_dict_sup_w = self.model(data_sup_w) loss_dict_sup = self.model(data_sup_s) for k, v in loss_dict_sup_w.items(): loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5 losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight'] losses_sup.backward() losses = losses_sup.detach() loss_dict.update(loss_dict_sup) loss_dict.update({'loss_sup_sum': loss_dict['loss']}) curr_iter = len(self.loader) * epoch_id + step_id st_iter = self.semi_start_iters if curr_iter == st_iter: logger.info("***" * 30) logger.info('Semi starting ...') logger.info("***" * 30) if curr_iter > st_iter: unsup_weight = train_cfg['unsup_weight'] if train_cfg['suppress'] == 'linear': tar_iter = st_iter * 2 if curr_iter <= tar_iter: unsup_weight *= (curr_iter - st_iter) / st_iter elif train_cfg['suppress'] == 'exp': tar_iter = st_iter + 2000 if curr_iter <= tar_iter: scale = np.exp((curr_iter - tar_iter) / 1000) unsup_weight *= scale elif train_cfg['suppress'] == 'step': tar_iter = st_iter * 2 if curr_iter <= tar_iter: unsup_weight *= 0.25 else: raise ValueError if data_unsup_w['image'].shape != data_unsup_s[ 'image'].shape: data_unsup_w, data_unsup_s = align_weak_strong_shape( data_unsup_w, data_unsup_s) data_unsup_w['epoch_id'] = epoch_id data_unsup_s['epoch_id'] = epoch_id data_unsup_s['get_data'] = True student_preds = self.model(data_unsup_s) with paddle.no_grad(): data_unsup_w['is_teacher'] = True teacher_preds = self.ema.model(data_unsup_w) train_cfg['curr_iter'] = curr_iter train_cfg['st_iter'] = st_iter if self._nranks > 1: loss_dict_unsup = self.model._layers.get_ssod_loss( student_preds, teacher_preds, train_cfg) else: loss_dict_unsup = self.model.get_ssod_loss( student_preds, teacher_preds, train_cfg) fg_num = loss_dict_unsup["fg_sum"] del loss_dict_unsup["fg_sum"] distill_weights = 
train_cfg['loss_weight'] loss_dict_unsup = { k: v * distill_weights[k] for k, v in loss_dict_unsup.items() } losses_unsup = sum([ metrics_value for metrics_value in loss_dict_unsup.values() ]) * unsup_weight losses_unsup.backward() loss_dict.update(loss_dict_unsup) loss_dict.update({'loss_unsup_sum': losses_unsup}) losses += losses_unsup.detach() loss_dict.update({"fg_sum": fg_num}) loss_dict['loss'] = losses self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() self.optimizer.clear_grad() self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(loss_dict) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) # Note: ema_start_iters if self.use_ema and curr_iter == self.ema_start_iters: logger.info("***" * 30) logger.info('EMA starting ...') logger.info("***" * 30) self.ema.update(self.model, decay=0) elif self.use_ema and curr_iter > self.ema_start_iters: self.ema.update(self.model) iter_tic = time.time() is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) if is_snapshot and self.use_ema: # apply ema weight on model weight = copy.deepcopy(self.ema.model.state_dict()) for k, v in weight.items(): if paddle.is_floating_point(v): weight[k].stop_gradient = True self.status['weight'] = weight self._compose_callback.on_epoch_end(self.status) if validate and is_snapshot: if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. 
if self.cfg.metric == 'VOC': self.cfg['EvalReader']['collate_batch'] = False self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num, batch_sampler=self._eval_batch_sampler) # if validation in training is enabled, metrics should be re-init # Init_mark makes sure this code will only execute once if validate and Init_mark == False: Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() with paddle.no_grad(): self.status['save_best_model'] = True self._eval_with_loader(self._eval_loader) if is_snapshot and self.use_ema: self.status.pop('weight') self._compose_callback.on_train_end(self.status) def evaluate(self): # get distributed model if self.cfg.get('fleet', False): self.model = fleet.distributed_model(self.model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model = paddle.DataParallel( self.model, find_unused_parameters=find_unused_parameters) with paddle.no_grad(): self._eval_with_loader(self.loader) def _eval_with_loader(self, loader): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' test_cfg = self.cfg.DenseTeacher['test_cfg'] if test_cfg['inference_on'] == 'teacher': logger.info("***** teacher model evaluating *****") eval_model = self.ema.model else: logger.info("***** student model evaluating *****") eval_model = self.model eval_model.eval() if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num, self._eval_batch_sampler) self._flops(flops_loader) for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward if self.use_amp: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): outs = eval_model(data) else: outs = eval_model(data) # update metrics for metric in self._metrics: metric.update(data, outs) # multi-scale inputs: all inputs have same im_id if isinstance(data, typing.Sequence): sample_num += data[0]['im_id'].numpy().shape[0] else: sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) self._reset_metrics() class Trainer_ARSL(Trainer): def __init__(self, cfg, mode='train'): self.cfg = cfg assert mode.lower() in ['train', 'eval', 'test'], \ "mode should be 'train', 'eval' or 'test'" self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False capital_mode = self.mode.capitalize() self.use_ema = False self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if self.mode == 'train': self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( 'UnsupTrainDataset') self.loader = create('SemiTrainReader')( self.dataset, self.dataset_unlabel, cfg.worker_num) # build model if 'model' not in self.cfg: self.student_model = create(cfg.architecture) self.teacher_model = create(cfg.architecture) self.model = EnsembleTSModel(self.teacher_model, self.student_model) else: self.model = self.cfg.model self.is_loaded_weights = True 
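# ---------------------------------------------------------------------------
# [Editor's sketch] The `suppress` branch in Trainer_DenseTeacher.train above
# ramps the unsupervised loss weight once curr_iter passes semi_start_iters;
# the three schedules reduce to this pure function (variable names follow
# that loop; the function itself is illustrative, not part of the file):
import numpy as np

def suppressed_unsup_weight(unsup_weight, curr_iter, st_iter, mode='linear'):
    if mode == 'linear' and curr_iter <= st_iter * 2:
        unsup_weight *= (curr_iter - st_iter) / st_iter  # ramps 0 -> 1
    elif mode == 'exp' and curr_iter <= st_iter + 2000:
        unsup_weight *= np.exp((curr_iter - (st_iter + 2000)) / 1000.)
    elif mode == 'step' and curr_iter <= st_iter * 2:
        unsup_weight *= 0.25  # constant damping until 2 * st_iter
    return unsup_weight
# ---------------------------------------------------------------------------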
# save path for burn-in model self.base_path = cfg.get('weights') self.base_path = os.path.dirname(self.base_path) # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) self.loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here self.start_epoch = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch self.epoch_iter = self.cfg.epoch_iter # set fixed iter in each epoch to control checkpoint # build optimizer in train mode if self.mode == 'train': steps_per_epoch = self.epoch_iter self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model.modelStudent) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() self.iter = 0 def resume_weights(self, weights): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer) else: self.start_epoch = load_weight(self.model, weights, self.optimizer) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False # if validation in training is enabled, metrics should be re-init if validate: self._init_metrics(validate=validate) self._reset_metrics() if self.cfg.get('fleet', False): self.model.modelStudent = fleet.distributed_model( self.model.modelStudent) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model.modelStudent = paddle.DataParallel( self.model.modelStudent, find_unused_parameters=find_unused_parameters) # set fixed iter in each epoch to control checkpoint self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, 'steps_per_epoch': self.epoch_iter }) print('338 Len of DataLoader: {}'.format(len(self.loader))) self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) self._compose_callback.on_train_begin(self.status) epoch_id = self.start_epoch self.iter = self.start_epoch * self.epoch_iter # use iter rather than epoch to control training schedule while self.iter < self.cfg.max_iter: # epoch loop self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset_label.set_epoch(epoch_id) self.loader.dataset_unlabel.set_epoch(epoch_id) paddle.device.cuda.empty_cache() # clear GPU memory # set model status self.model.modelStudent.train() self.model.modelTeacher.eval() iter_tic = time.time() # iter loop in each eopch for step_id in range(self.epoch_iter): data = next(self.loader) self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id # profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) # model forward and calculate loss loss_dict = 
self.run_step_full_semisup(data) if (step_id + 1) % self.cfg.optimize_rate == 0: self.optimizer.step() self.optimizer.clear_grad() curr_lr = self.optimizer.get_lr() self.lr.step() # update log status self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(loss_dict) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) self.iter += 1 iter_tic = time.time() self._compose_callback.on_epoch_end(self.status) if validate and (self._nranks < 2 or self._local_rank == 0) \ and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \ or epoch_id == self.end_epoch - 1): if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num, batch_sampler=self._eval_batch_sampler) if validate and Init_mark == False: Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() with paddle.no_grad(): self.status['save_best_model'] = True # before burn-in stage, eval student. after burn-in stage, eval teacher if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']: print("start eval student model") self._eval_with_loader( self._eval_loader, mode="student") else: print("start eval teacher model") self._eval_with_loader( self._eval_loader, mode="teacher") epoch_id += 1 self._compose_callback.on_train_end(self.status) def merge_data(self, data1, data2): data = copy.deepcopy(data1) for k, v in data1.items(): if type(v) is paddle.Tensor: data[k] = paddle.concat(x=[data[k], data2[k]], axis=0) elif type(v) is list: data[k].extend(data2[k]) return data def run_step_full_semisup(self, data): label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data data_merge = self.merge_data(label_data_k, label_data_q) loss_sup_dict = self.model.modelStudent(data_merge, branch="supervised") loss_dict = {} for key in loss_sup_dict.keys(): if key[:4] == "loss": loss_dict[key] = loss_sup_dict[key] * 1 losses_sup = paddle.add_n(list(loss_dict.values())) # norm loss when using gradient accumulation losses_sup = losses_sup / self.cfg.optimize_rate losses_sup.backward() for key in loss_sup_dict.keys(): loss_dict[key + "_pseudo"] = paddle.to_tensor([0]) loss_dict["loss_tot"] = losses_sup """ semi-supervised training after burn-in stage """ if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']: # init teacher model with burn-up weight if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']: print( 'Starting semi-supervised learning and load the teacher model.' 
) self._update_teacher_model(keep_rate=0.00) # save burn-in model if dist.get_world_size() < 2 or dist.get_rank() == 0: print('saving burn-in model.') save_name = 'burnIn' epoch_id = self.iter // self.epoch_iter save_model(self.model, self.optimizer, self.base_path, save_name, epoch_id) # Update teacher model with EMA elif (self.iter + 1) % self.cfg.optimize_rate == 0: self._update_teacher_model( keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE']) #warm-up weight for pseudo loss pseudo_weight = self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT'] pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS'] temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP'] if temp <= pseudo_warmup_iter: pseudo_weight *= (temp / pseudo_warmup_iter) # get teacher predictions on weak-augmented unlabeled data with paddle.no_grad(): teacher_pred = self.model.modelTeacher( unlabel_data_k, branch='semi_supervised') # calculate unsupervised loss on strong-augmented unlabeled data loss_unsup_dict = self.model.modelStudent( unlabel_data_q, branch="semi_supervised", teacher_prediction=teacher_pred, ) for key in loss_unsup_dict.keys(): if key[-6:] == "pseudo": loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight losses_unsup = paddle.add_n(list(loss_unsup_dict.values())) # norm loss when using gradient accumulation losses_unsup = losses_unsup / self.cfg.optimize_rate losses_unsup.backward() loss_dict.update(loss_unsup_dict) loss_dict["loss_tot"] += losses_unsup return loss_dict def export(self, output_dir='output_inference'): self.model.eval() model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] save_dir = os.path.join(output_dir, model_name) if not os.path.exists(save_dir): os.makedirs(save_dir) image_shape = None if self.cfg.architecture in MOT_ARCH: test_reader_name = 'TestMOTReader' else: test_reader_name = 'TestReader' if 'inputs_def' in self.cfg[test_reader_name]: inputs_def = self.cfg[test_reader_name]['inputs_def'] image_shape = inputs_def.get('image_shape', None) # set image_shape=[3, -1, -1] as default if image_shape is None: image_shape = [3, -1, -1] self.model.modelTeacher.eval() if hasattr(self.model.modelTeacher, 'deploy'): self.model.modelTeacher.deploy = True # Save infer cfg _dump_infer_config(self.cfg, os.path.join(save_dir, 'infer_cfg.yml'), image_shape, self.model.modelTeacher) input_spec = [{ "image": InputSpec( shape=[None] + image_shape, name='image'), "im_shape": InputSpec( shape=[None, 2], name='im_shape'), "scale_factor": InputSpec( shape=[None, 2], name='scale_factor') }] if self.cfg.architecture == 'DeepSORT': input_spec[0].update({ "crops": InputSpec( shape=[None, 3, 192, 64], name='crops') }) static_model = paddle.jit.to_static( self.model.modelTeacher, input_spec=input_spec) # NOTE: dy2st do not pruned program, but jit.save will prune program # input spec, prune input spec here and save with pruned input spec pruned_input_spec = _prune_input_spec(input_spec, static_model.forward.main_program, static_model.forward.outputs) # dy2st and save model if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT': paddle.jit.save( static_model, os.path.join(save_dir, 'model'), input_spec=pruned_input_spec) else: self.cfg.slim.save_quantized_model( self.model.modelTeacher, os.path.join(save_dir, 'model'), input_spec=pruned_input_spec) logger.info("Export model and saved in {}".format(save_dir)) def _eval_with_loader(self, loader, mode="teacher"): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' # self.model.eval() 
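# ---------------------------------------------------------------------------
# [Editor's note] The teacher evaluated here is an exponential moving average
# of the student, maintained by _update_teacher_model defined below:
#     teacher <- keep_rate * teacher + (1 - keep_rate) * student
# so keep_rate=0.0 copies the student (the burn-in handoff above) and a
# keep_rate near 1.0 yields a slowly-moving teacher. Per-parameter sketch
# (illustrative helper, not part of the file):
import paddle

@paddle.no_grad()
def ema_update(teacher_param, student_param, keep_rate=0.996):
    paddle.assign(keep_rate * teacher_param +
                  (1.0 - keep_rate) * student_param, output=teacher_param)
# ---------------------------------------------------------------------------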
self.model.modelTeacher.eval() self.model.modelStudent.eval() for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) if mode == "teacher": outs = self.model.modelTeacher(data) else: outs = self.model.modelStudent(data) # update metrics for metric in self._metrics: metric.update(data, outs) sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) # reset metric states for metric may performed multiple times self._reset_metrics() def evaluate(self): with paddle.no_grad(): self._eval_with_loader(self.loader) @paddle.no_grad() def _update_teacher_model(self, keep_rate=0.996): student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict()) new_teacher_dict = dict() for key, value in self.model.modelTeacher.state_dict().items(): if key in student_model_dict.keys(): v = student_model_dict[key] * (1 - keep_rate ) + value * keep_rate v.stop_gradient = True new_teacher_dict[key] = v else: raise Exception("{} is not found in student model".format(key)) self.model.modelTeacher.set_dict(new_teacher_dict) class EnsembleTSModel(nn.Layer): def __init__(self, modelTeacher, modelStudent): super(EnsembleTSModel, self).__init__() self.modelTeacher = modelTeacher self.modelStudent = modelStudent class Trainer_Semi_RTDETR(Trainer): def __init__(self, cfg, mode='train'): self.cfg = cfg assert mode.lower() in ['train', 'eval', 'test'], \ "mode should be 'train', 'eval' or 'test'" self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False self.use_amp = self.cfg.get('amp', False) self.amp_level = self.cfg.get('amp_level', 'O1') self.custom_white_list = self.cfg.get('custom_white_list', None) self.custom_black_list = self.cfg.get('custom_black_list', None) # build data loader capital_mode = self.mode.capitalize() self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if self.mode == 'train': self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( 'UnsupTrainDataset') self.loader = create('SemiTrainReader')( self.dataset, self.dataset_unlabel, cfg.worker_num) # build model if 'model' not in self.cfg: self.model = create(cfg.SSOD) else: self.model = self.cfg.model self.is_loaded_weights = True # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. if cfg.metric == 'VOC': cfg['EvalReader']['collate_batch'] = False self.loader = create('EvalReader')(self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) if steps_per_epoch < 1: logger.warning( "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) # Unstructured pruner is only enabled in the train mode. 
if self.cfg.get('unstructured_prune'): self.pruner = create('UnstructuredPruner')(self.model, steps_per_epoch) if self.use_amp and self.amp_level == 'O2': self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} self.start_epoch = 0 self.start_iter = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def load_semi_weights(self, t_weights, s_weights): if self.is_loaded_weights: return self.start_epoch = 0 load_pretrain_weight(self.model.teacher, t_weights) load_pretrain_weight(self.model.student, s_weights) logger.info("Load teacher weights {} to start training".format( t_weights)) logger.info("Load student weights {} to start training".format( s_weights)) def resume_weights(self, weights, exchange=True): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer, exchange) else: self.start_iter, self.start_epoch = load_weight( self.model, weights, self.optimizer, self.ema if self.use_ema else None, exchange) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) logger.debug("Resume weights of iter {}".format(self.start_iter)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False if validate: self.cfg.EvalDataset = create("EvalDataset")() model = self.model sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: # self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( # self.model) model.teacher = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( model.teacher) model.student = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( self.model.student) if self.cfg.get('fleet', False): # model = fleet.distributed_model(model) model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False model = paddle.DataParallel( model, find_unused_parameters=find_unused_parameters) if self.cfg.get('amp', False): scaler = amp.GradScaler( enable=self.cfg.use_gpu or self.cfg.use_npu, init_loss_scaling=1024) self.status.update({ 'epoch_id': self.start_epoch, 'iter_id': self.start_iter, # 'step_id': self.start_step, 'steps_per_epoch': len(self.loader), }) self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num) self._flops(flops_loader) profiler_options = self.cfg.get('profiler_options', None) self._compose_callback.on_train_begin(self.status) iter_id = self.start_iter self.status['iter_id'] = iter_id self.status['eval_interval'] = self.cfg.eval_interval self.status['save_interval'] = self.cfg.save_interval for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset_label.set_epoch(epoch_id) 
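# ---------------------------------------------------------------------------
# [Editor's annotation] SemiTrainReader draws from two datasets, so the epoch
# counter must be advanced on both: the labeled split on the line above and
# the unlabeled split on the line below. Each batch it yields then unpacks
# into weak/strong views of the labeled ("sup") and unlabeled ("unsup")
# samples, as the loop body that follows shows.
# ---------------------------------------------------------------------------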
self.loader.dataset_unlabel.set_epoch(epoch_id) iter_tic = time.time() if self._nranks > 1: # print(model) model._layers.teacher.eval() model._layers.student.train() else: model.teacher.eval() model.student.train() iter_tic = time.time() for step_id in range(len(self.loader)): data = next(self.loader) data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data data_sup_w['epoch_id'] = epoch_id data_sup_s['epoch_id'] = epoch_id data_unsup_w['epoch_id'] = epoch_id data_unsup_s['epoch_id'] = epoch_id data = [data_sup_w, data_sup_s, data_unsup_w, data_unsup_s] iter_id += 1 self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id self.status['iter_id'] = iter_id data.append(iter_id) profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) if self.cfg.get('amp', False): with amp.auto_cast(enable=self.cfg.use_gpu): # model forward if self._nranks > 1: outputs = model._layers(data) else: outputs = model(data) loss = outputs['loss'] scaled_loss = scaler.scale(loss) scaled_loss.backward() scaler.minimize(self.optimizer, scaled_loss) else: outputs = model(data) loss = outputs['loss'] # model backward loss.backward() self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() if self.cfg.get('unstructured_prune'): self.pruner.step() self.optimizer.clear_grad() # print(outputs) # outputs=reduce_dict(outputs) # if self.model.debug: # check_gradient(model) # self.check_gradient() self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(outputs) self.status['batch_time'].update(time.time() - iter_tic) if validate and (self._nranks < 2 or self._local_rank == 0) and \ ((iter_id + 1) % self.cfg.eval_interval == 0): if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. 
                        if self.cfg.metric == 'VOC':
                            self.cfg['EvalReader']['collate_batch'] = False
                        self._eval_loader = create('EvalReader')(
                            self._eval_dataset,
                            self.cfg.worker_num,
                            batch_sampler=self._eval_batch_sampler)
                    # if validation in training is enabled, metrics should be re-initialized
                    # Init_mark makes sure this code will only execute once
                    if validate and Init_mark == False:
                        Init_mark = True
                        self._init_metrics(validate=validate)
                        self._reset_metrics()
                    with paddle.no_grad():
                        self.status['save_best_model'] = True
                        self._eval_with_loader(self._eval_loader)
                    model._layers.student.train()
                self._compose_callback.on_step_end(self.status)
                iter_tic = time.time()

            if self.cfg.get('unstructured_prune'):
                self.pruner.update_params()
            self._compose_callback.on_epoch_end(self.status)
        self._compose_callback.on_train_end(self.status)

    def _eval_with_loader(self, loader):
        sample_num = 0
        tic = time.time()
        self._compose_callback.on_epoch_begin(self.status)
        self.status['mode'] = 'eval'
        self.model.eval()
        if self.cfg.get('print_flops', False):
            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
            self._flops(flops_loader)

        print("*****teacher evaluate*****")
        for step_id, data in enumerate(loader):
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)
            # forward
            outs = self.model.teacher(data)

            # update metrics
            for metric in self._metrics:
                metric.update(data, outs)

            # multi-scale inputs: all inputs have same im_id
            if isinstance(data, typing.Sequence):
                sample_num += data[0]['im_id'].numpy().shape[0]
            else:
                sample_num += data['im_id'].numpy().shape[0]
            self._compose_callback.on_step_end(self.status)

        self.status['sample_num'] = sample_num
        self.status['cost_time'] = time.time() - tic

        # accumulate metric to log out
        for metric in self._metrics:
            metric.accumulate()
            metric.log()
        self._compose_callback.on_epoch_end(self.status)
        # reset metric states, as metrics may be evaluated multiple times
        self._reset_metrics()

        print("*****student evaluate*****")
        for step_id, data in enumerate(loader):
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)
            # forward
            outs = self.model.student(data)

            # update metrics
            for metric in self._metrics:
                metric.update(data, outs)

            # multi-scale inputs: all inputs have same im_id
            if isinstance(data, typing.Sequence):
                sample_num += data[0]['im_id'].numpy().shape[0]
            else:
                sample_num += data['im_id'].numpy().shape[0]
            self._compose_callback.on_step_end(self.status)

        self.status['sample_num'] = sample_num
        self.status['cost_time'] = time.time() - tic

        # accumulate metric to log out
        for metric in self._metrics:
            metric.accumulate()
            metric.log()
        # reset metric states, as metrics may be evaluated multiple times
        self._reset_metrics()
        self.status['mode'] = 'train'

    def evaluate(self):
        with paddle.no_grad():
            self._eval_with_loader(self.loader)


================================================
FILE: ppdet/ext_op/README.md
================================================
# Building the Custom OPs

The rotated-box IoU OPs are implemented following the PaddlePaddle guide on
[custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).

## 1. Dependencies
- Paddle >= 2.0.1
- gcc 8.2

## 2. Installation
```
python setup.py install
```

Once compiled, the OPs can be used directly. A usage example for `rbox_iou`:
```
# import the custom op
import numpy as np
import paddle
from ext_op import rbox_iou

paddle.set_device('gpu:0')
paddle.disable_static()

rbox1 = np.random.rand(13000, 5)
rbox2 = np.random.rand(7, 5)
pd_rbox1 = paddle.to_tensor(rbox1)
pd_rbox2 = paddle.to_tensor(rbox2)

iou = rbox_iou(pd_rbox1, pd_rbox2)
print('iou', iou)
```

## 3. Unit Tests
Run the unit tests to verify that the custom operators behave correctly, for example:
```
python unittest/test_matched_rbox_iou.py
```

================================================
FILE: ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/

#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"

template <typename T>
void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
                                 const T *rbox2_data_ptr, T *output_data_ptr) {
  int i;
  for (i = 0; i < rbox_num; i++) {
    output_data_ptr[i] = rbox_iou_single<T>(rbox1_data_ptr + i * 5,
                                            rbox2_data_ptr + i * 5);
  }
}

#define CHECK_INPUT_CPU(x)                                                     \
  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")

std::vector<paddle::Tensor>
MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
                         const paddle::Tensor &rbox2) {
  CHECK_INPUT_CPU(rbox1);
  CHECK_INPUT_CPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");

  auto rbox_num = rbox1.shape()[0];
  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());

  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] {
                               matched_rbox_iou_cpu_kernel<data_t>(
                                   rbox_num, rbox1.data<data_t>(),
                                   rbox2.data<data_t>(),
                                   output.data<data_t>());
                             }));

  return {output};
}

#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
                          const paddle::Tensor &rbox2);
#endif

#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be in the same place.")

std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
                                                  const paddle::Tensor &rbox2) {
  CHECK_INPUT_SAME(rbox1, rbox2);
  if (rbox1.is_cpu()) {
    return MatchedRboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
  } else if (rbox1.is_gpu()) {
    return MatchedRboxIouCUDAForward(rbox1, rbox2);
#endif
  }
}

std::vector<std::vector<int64_t>>
MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
                         std::vector<int64_t> rbox2_shape) {
  return {{rbox1_shape[0]}};
}

std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
                                                       paddle::DataType t2) {
  return {t1};
}

PD_BUILD_OP(matched_rbox_iou)
    .Inputs({"RBOX1", "RBOX2"})
    .Outputs({"Output"})
    .SetKernelFn(PD_KERNEL(MatchedRboxIouForward))
    .SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype));
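For the matched variant just defined, usage mirrors the README example for `rbox_iou`, except that both inputs must have the same first dimension. A minimal sketch, assuming the extension was installed with `python setup.py install` (shapes are illustrative):

```
# element-wise IoU over two equal-length sets of rotated boxes
# box layout: [x_ctr, y_ctr, w, h, angle]
import numpy as np
import paddle
from ext_op import matched_rbox_iou

boxes_a = paddle.to_tensor(np.random.rand(1000, 5))
boxes_b = paddle.to_tensor(np.random.rand(1000, 5))  # same first dim as boxes_a

iou = matched_rbox_iou(boxes_a, boxes_b)  # shape [1000]; iou[i] pairs row i with row i
print(iou.shape)
```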
================================================
FILE: ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/

#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"

template <typename T>
__global__ void matched_rbox_iou_cuda_kernel(const int rbox_num,
                                             const T *rbox1_data_ptr,
                                             const T *rbox2_data_ptr,
                                             T *output_data_ptr) {
  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
       tid += blockDim.x * gridDim.x) {
    output_data_ptr[tid] = rbox_iou_single<T>(rbox1_data_ptr + tid * 5,
                                              rbox2_data_ptr + tid * 5);
  }
}

#define CHECK_INPUT_GPU(x)                                                     \
  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")

std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
                          const paddle::Tensor &rbox2) {
  CHECK_INPUT_GPU(rbox1);
  CHECK_INPUT_GPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");

  auto rbox_num = rbox1.shape()[0];
  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());

  const int thread_per_block = 512;
  const int block_per_grid = CeilDiv(rbox_num, thread_per_block);

  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
        matched_rbox_iou_cuda_kernel<data_t>
            <<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
                rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
                output.data<data_t>());
      }));

  return {output};
}
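The nms_rotated sources that follow implement greedy, score-ordered non-maximum suppression over rotated boxes. As a reference for what they compute, here is a hedged NumPy sketch of the same scheme; `pairwise_iou` is an assumed stand-in for something like the `rbox_iou` op above, not part of the extension:

```
import numpy as np

def greedy_nms_rotated_sketch(boxes, scores, pairwise_iou, threshold=0.5):
    """boxes: [N, 5] rotated boxes, scores: [N]; returns kept indices."""
    order = np.argsort(-scores)            # visit boxes from highest score down
    ious = pairwise_iou(boxes, boxes)      # [N, N] matrix, e.g. rbox_iou(...).numpy()
    suppressed = np.zeros(len(boxes), dtype=bool)
    keep = []
    for i in order:
        if suppressed[i]:
            continue
        keep.append(i)                     # the highest-scoring survivor is kept
        suppressed |= ious[i] > threshold  # and silences everything it overlaps
    return np.array(keep, dtype=np.int64)
```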
================================================
FILE: ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"

template <typename T>
void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,
                            const int64_t num_boxes, int64_t *num_keep_boxes,
                            int64_t *output_data) {
  int num_masks = CeilDiv(num_boxes, 64);
  std::vector<int64_t> masks(num_masks, 0);
  for (int64_t i = 0; i < num_boxes; ++i) {
    if (masks[i / 64] & 1ULL << (i % 64))
      continue;
    T box_1[5];
    for (int k = 0; k < 5; ++k) {
      box_1[k] = boxes_data[i * 5 + k];
    }
    for (int64_t j = i + 1; j < num_boxes; ++j) {
      if (masks[j / 64] & 1ULL << (j % 64))
        continue;
      T box_2[5];
      for (int k = 0; k < 5; ++k) {
        box_2[k] = boxes_data[j * 5 + k];
      }
      if (rbox_iou_single<T>(box_1, box_2) > threshold) {
        masks[j / 64] |= 1ULL << (j % 64);
      }
    }
  }
  int64_t output_data_idx = 0;
  for (int64_t i = 0; i < num_boxes; ++i) {
    if (masks[i / 64] & 1ULL << (i % 64))
      continue;
    output_data[output_data_idx++] = i;
  }
  *num_keep_boxes = output_data_idx;
  for (; output_data_idx < num_boxes; ++output_data_idx) {
    output_data[output_data_idx] = 0;
  }
}

#define CHECK_INPUT_CPU(x)                                                     \
  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")

std::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,
                                                 const paddle::Tensor &scores,
                                                 float threshold) {
  CHECK_INPUT_CPU(boxes);
  CHECK_INPUT_CPU(scores);

  auto num_boxes = boxes.shape()[0];

  auto order_t =
      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);

  auto keep =
      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
  int64_t num_keep_boxes = 0;

  PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] {
                               nms_rotated_cpu_kernel<data_t>(
                                   boxes_sorted.data<data_t>(), threshold,
                                   num_boxes, &num_keep_boxes,
                                   keep.data<int64_t>());
                             }));

  keep = keep.slice(0, num_keep_boxes);
  return {paddle::gather(order_t, keep, /* axis=*/0)};
}

#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
                                                  const paddle::Tensor &scores,
                                                  float threshold);
#endif

std::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,
                                              const paddle::Tensor &scores,
                                              float threshold) {
  if (boxes.is_cpu()) {
    return NMSRotatedCPUForward(boxes, scores, threshold);
#ifdef PADDLE_WITH_CUDA
  } else if (boxes.is_gpu()) {
    return NMSRotatedCUDAForward(boxes, scores, threshold);
#endif
  }
}

std::vector<std::vector<int64_t>>
NMSRotatedInferShape(std::vector<int64_t> boxes_shape,
                     std::vector<int64_t> scores_shape) {
  return {{-1}};
}

std::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,
                                                   paddle::DataType t2) {
  return {paddle::DataType::INT64};
}

PD_BUILD_OP(nms_rotated)
    .Inputs({"Boxes", "Scores"})
    .Outputs({"Output"})
    .Attrs({"threshold: float"})
    .SetKernelFn(PD_KERNEL(NMSRotatedForward))
    .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));

================================================
FILE: ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
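//
// This CUDA implementation mirrors the greedy CPU suppression above: boxes
// are tiled into 64-wide chunks, each thread computes a 64-bit overlap mask
// of one box against one chunk, and the per-box masks are reduced into the
// final keep list on the host.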
#include "../rbox_iou/rbox_iou_utils.h" #include "paddle/extension.h" static const int64_t threadsPerBlock = sizeof(int64_t) * 8; template __global__ void nms_rotated_cuda_kernel(const T *boxes_data, const float threshold, const int64_t num_boxes, int64_t *masks) { auto raw_start = blockIdx.y; auto col_start = blockIdx.x; if (raw_start > col_start) return; const int raw_last_storage = min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock); const int col_last_storage = min(num_boxes - col_start * threadsPerBlock, threadsPerBlock); if (threadIdx.x < raw_last_storage) { int64_t mask = 0; auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x; const T *current_box = boxes_data + current_box_idx * 5; for (int i = 0; i < col_last_storage; ++i) { const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5; if (rbox_iou_single(current_box, target_box) > threshold) { mask |= 1ULL << i; } } const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); masks[current_box_idx * blocks_per_line + col_start] = mask; } } #define CHECK_INPUT_GPU(x) \ PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") std::vector NMSRotatedCUDAForward(const paddle::Tensor &boxes, const paddle::Tensor &scores, float threshold) { CHECK_INPUT_GPU(boxes); CHECK_INPUT_GPU(scores); auto num_boxes = boxes.shape()[0]; auto order_t = std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true)); auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0); const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); dim3 block(threadsPerBlock); dim3 grid(blocks_per_line, blocks_per_line); auto mask_dev = paddle::empty({num_boxes * blocks_per_line}, paddle::DataType::INT64, paddle::GPUPlace()); PD_DISPATCH_FLOATING_TYPES( boxes.type(), "nms_rotated_cuda_kernel", ([&] { nms_rotated_cuda_kernel<<>>( boxes_sorted.data(), threshold, num_boxes, mask_dev.data()); })); auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true); auto keep_host = paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace()); int64_t *keep_host_ptr = keep_host.data(); int64_t *mask_host_ptr = mask_host.data(); std::vector remv(blocks_per_line); int64_t last_box_num = 0; for (int64_t i = 0; i < num_boxes; ++i) { auto remv_element_id = i / threadsPerBlock; auto remv_bit_id = i % threadsPerBlock; if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) { keep_host_ptr[last_box_num++] = i; int64_t *current_mask = mask_host_ptr + i * blocks_per_line; for (auto j = remv_element_id; j < blocks_per_line; ++j) { remv[j] |= current_mask[j]; } } } keep_host = keep_host.slice(0, last_box_num); auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true); return {paddle::gather(order_t, keep_dev, /* axis=*/0)}; } ================================================ FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc ================================================ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
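//
// rbox_iou computes the dense [N, M] IoU matrix between two sets of rotated
// boxes: the CPU kernel below loops over every pair, while rbox_iou.cu tiles
// the same computation over 2D thread blocks with shared-memory box caches.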
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/

#include "paddle/extension.h"
#include "rbox_iou_utils.h"

template <typename T>
void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,
                         const T *rbox1_data_ptr, const T *rbox2_data_ptr,
                         T *output_data_ptr) {
  int i, j;
  for (i = 0; i < rbox1_num; i++) {
    for (j = 0; j < rbox2_num; j++) {
      int offset = i * rbox2_num + j;
      output_data_ptr[offset] =
          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
    }
  }
}

#define CHECK_INPUT_CPU(x)                                                     \
  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")

std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,
                                              const paddle::Tensor &rbox2) {
  CHECK_INPUT_CPU(rbox1);
  CHECK_INPUT_CPU(rbox2);

  auto rbox1_num = rbox1.shape()[0];
  auto rbox2_num = rbox2.shape()[0];

  auto output = paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(),
                              paddle::CPUPlace());

  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
                               rbox_iou_cpu_kernel<data_t>(
                                   rbox1_num, rbox2_num, rbox1.data<data_t>(),
                                   rbox2.data<data_t>(),
                                   output.data<data_t>());
                             }));
  return {output};
}

#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
                                               const paddle::Tensor &rbox2);
#endif

#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be in the same place.")

std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,
                                           const paddle::Tensor &rbox2) {
  CHECK_INPUT_SAME(rbox1, rbox2);
  if (rbox1.is_cpu()) {
    return RboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
  } else if (rbox1.is_gpu()) {
    return RboxIouCUDAForward(rbox1, rbox2);
#endif
  }
}

std::vector<std::vector<int64_t>>
RboxIouInferShape(std::vector<int64_t> rbox1_shape,
                  std::vector<int64_t> rbox2_shape) {
  return {{rbox1_shape[0], rbox2_shape[0]}};
}

std::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,
                                                paddle::DataType t2) {
  return {t1};
}

PD_BUILD_OP(rbox_iou)
    .Inputs({"RBox1", "RBox2"})
    .Outputs({"Output"})
    .SetKernelFn(PD_KERNEL(RboxIouForward))
    .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));

================================================
FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu
================================================
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// // The code is based on // https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ #include "paddle/extension.h" #include "rbox_iou_utils.h" // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; template __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num, const T *rbox1_data_ptr, const T *rbox2_data_ptr, T *output_data_ptr) { // get row_start and col_start const int rbox1_block_idx = blockIdx.x * blockDim.x; const int rbox2_block_idx = blockIdx.y * blockDim.y; const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); __shared__ T block_boxes1[BLOCK_DIM_X * 5]; __shared__ T block_boxes2[BLOCK_DIM_Y * 5]; // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { block_boxes1[threadIdx.x * 5 + 0] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; block_boxes1[threadIdx.x * 5 + 1] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; block_boxes1[threadIdx.x * 5 + 2] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; block_boxes1[threadIdx.x * 5 + 3] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; block_boxes1[threadIdx.x * 5 + 4] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; } // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as // above: threadIdx.y == 0 if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { block_boxes2[threadIdx.x * 5 + 0] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; block_boxes2[threadIdx.x * 5 + 1] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; block_boxes2[threadIdx.x * 5 + 2] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; block_boxes2[threadIdx.x * 5 + 3] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; block_boxes2[threadIdx.x * 5 + 4] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; } // sync __syncthreads(); if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + threadIdx.y; output_data_ptr[offset] = rbox_iou_single( block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); } } #define CHECK_INPUT_GPU(x) \ PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, const paddle::Tensor &rbox2) { CHECK_INPUT_GPU(rbox1); CHECK_INPUT_GPU(rbox2); auto rbox1_num = rbox1.shape()[0]; auto rbox2_num = rbox2.shape()[0]; auto output = paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace()); const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); dim3 blocks(blocks_x, blocks_y); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); PD_DISPATCH_FLOATING_TYPES( rbox1.type(), "rbox_iou_cuda_kernel", ([&] { rbox_iou_cuda_kernel<<>>( rbox1_num, rbox2_num, rbox1.data(), rbox2.data(), output.data()); })); return {output}; } ================================================ FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h ================================================ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // The code is based on // https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ #pragma once #include #include #include #ifdef __CUDACC__ // Designates functions callable from the host (CPU) and the device (GPU) #define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ #else #include #define HOST_DEVICE #define HOST_DEVICE_INLINE HOST_DEVICE inline #endif namespace { template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; template struct Point { T x, y; HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} HOST_DEVICE_INLINE Point operator+(const Point &p) const { return Point(x + p.x, y + p.y); } HOST_DEVICE_INLINE Point &operator+=(const Point &p) { x += p.x; y += p.y; return *this; } HOST_DEVICE_INLINE Point operator-(const Point &p) const { return Point(x - p.x, y - p.y); } HOST_DEVICE_INLINE Point operator*(const T coeff) const { return Point(x * coeff, y * coeff); } }; template HOST_DEVICE_INLINE T dot_2d(const Point &A, const Point &B) { return A.x * B.x + A.y * B.y; } template HOST_DEVICE_INLINE T cross_2d(const Point &A, const Point &B) { return A.x * B.y - B.x * A.y; } template HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox &box, Point (&pts)[4]) { // M_PI / 180. == 0.01745329251 // double theta = box.a * 0.01745329251; // MODIFIED double theta = box.a; T cosTheta2 = (T)cos(theta) * 0.5f; T sinTheta2 = (T)sin(theta) * 0.5f; // y: top --> down; x: left --> right pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; pts[2].x = 2 * box.x_ctr - pts[0].x; pts[2].y = 2 * box.y_ctr - pts[0].y; pts[3].x = 2 * box.x_ctr - pts[1].x; pts[3].y = 2 * box.y_ctr - pts[1].y; } template HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24]) { // Line vector // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] Point vec1[4], vec2[4]; for (int i = 0; i < 4; i++) { vec1[i] = pts1[(i + 1) % 4] - pts1[i]; vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } // Line test - test all line combos for intersection int num = 0; // number of intersections for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { // Solve for 2x2 Ax=b T det = cross_2d(vec2[j], vec1[i]); // This takes care of parallel lines if (fabs(det) <= 1e-14) { continue; } auto vec12 = pts2[j] - pts1[i]; T t1 = cross_2d(vec2[j], vec12) / det; T t2 = cross_2d(vec1[i], vec12) / det; if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { intersections[num++] = pts1[i] + vec1[i] * t1; } } } // Check for vertices of rect1 inside rect2 { const auto &AB = vec2[0]; const auto &DA = vec2[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { // assume ABCD is the rectangle, and P is the point to be judged // P is inside ABCD iff. 
P's projection on AB lies within AB // and P's projection on AD lies within AD auto AP = pts1[i] - pts2[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { intersections[num++] = pts1[i]; } } } // Reverse the check - check for vertices of rect2 inside rect1 { const auto &AB = vec1[0]; const auto &DA = vec1[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { auto AP = pts2[i] - pts1[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { intersections[num++] = pts2[i]; } } } return num; } template HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], const int &num_in, Point (&q)[24], bool shift_to_zero = false) { assert(num_in >= 2); // Step 1: // Find point with minimum y // if more than 1 points have the same minimum y, // pick the one with the minimum x. int t = 0; for (int i = 1; i < num_in; i++) { if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { t = i; } } auto &start = p[t]; // starting point // Step 2: // Subtract starting point from every points (for sorting in the next step) for (int i = 0; i < num_in; i++) { q[i] = p[i] - start; } // Swap the starting point to position 0 auto tmp = q[0]; q[0] = q[t]; q[t] = tmp; // Step 3: // Sort point 1 ~ num_in according to their relative cross-product values // (essentially sorting according to angles) // If the angles are the same, sort according to their distance to origin T dist[24]; for (int i = 0; i < num_in; i++) { dist[i] = dot_2d(q[i], q[i]); } #ifdef __CUDACC__ // CUDA version // In the future, we can potentially use thrust // for sorting here to improve speed (though not guaranteed) for (int i = 1; i < num_in - 1; i++) { for (int j = i + 1; j < num_in; j++) { T crossProduct = cross_2d(q[i], q[j]); if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { auto q_tmp = q[i]; q[i] = q[j]; q[j] = q_tmp; auto dist_tmp = dist[i]; dist[i] = dist[j]; dist[j] = dist_tmp; } } } #else // CPU version std::sort(q + 1, q + num_in, [](const Point &A, const Point &B) -> bool { T temp = cross_2d(A, B); if (fabs(temp) < 1e-6) { return dot_2d(A, A) < dot_2d(B, B); } else { return temp > 0; } }); #endif // Step 4: // Make sure there are at least 2 points (that don't overlap with each other) // in the stack int k; // index of the non-overlapped second point for (k = 1; k < num_in; k++) { if (dist[k] > 1e-8) { break; } } if (k == num_in) { // We reach the end, which means the convex hull is just one point q[0] = p[t]; return 1; } q[1] = q[k]; int m = 2; // 2 points in the stack // Step 5: // Finally we can start the scanning process. // When a non-convex relationship between the 3 points is found // (either concave shape or duplicated points), // we pop the previous point from the stack // until the 3-point relationship is convex again, or // until the stack only contains two points for (int i = k + 1; i < num_in; i++) { while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { m--; } q[m++] = q[i]; } // Step 6 (Optional): // In general sense we need the original coordinates, so we // need to shift the points back (reverting Step 2) // But if we're only interested in getting the area/perimeter of the shape // We can simply return. 
if (!shift_to_zero) { for (int i = 0; i < m; i++) { q[i] += start; } } return m; } template HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int &m) { if (m <= 2) { return 0; } T area = 0; for (int i = 1; i < m - 1; i++) { area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } return area / 2.0; } template HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox &box1, const RotatedBox &box2) { // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned // from rotated_rect_intersection_pts Point intersectPts[24], orderedPts[24]; Point pts1[4]; Point pts2[4]; get_rotated_vertices(box1, pts1); get_rotated_vertices(box2, pts2); int num = get_intersection_points(pts1, pts2, intersectPts); if (num <= 2) { return 0.0; } // Convex Hull to order the intersection points in clockwise order and find // the contour area. int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); return polygon_area(orderedPts, num_convex); } } // namespace template HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw, T const *const box2_raw) { // shift center to the middle point to achieve higher precision in result RotatedBox box1, box2; auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; box1.x_ctr = box1_raw[0] - center_shift_x; box1.y_ctr = box1_raw[1] - center_shift_y; box1.w = box1_raw[2]; box1.h = box1_raw[3]; box1.a = box1_raw[4]; box2.x_ctr = box2_raw[0] - center_shift_x; box2.y_ctr = box2_raw[1] - center_shift_y; box2.w = box2_raw[2]; box2.h = box2_raw[3]; box2.a = box2_raw[4]; if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) { return 0.f; } const T area1 = box1.w * box1.h; const T area2 = box2.w * box2.h; const T intersection = rboxes_intersection(box1, box2); const T iou = intersection / (area1 + area2 - intersection); return iou; } /** Computes ceil(a / b) */ HOST_DEVICE inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; } ================================================ FILE: ppdet/ext_op/setup.py ================================================ import os import glob import paddle from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup def get_extensions(): root_dir = os.path.dirname(os.path.abspath(__file__)) ext_root_dir = os.path.join(root_dir, 'csrc') sources = [] for ext_name in os.listdir(ext_root_dir): ext_dir = os.path.join(ext_root_dir, ext_name) source = glob.glob(os.path.join(ext_dir, '*.cc')) kwargs = dict() if paddle.device.is_compiled_with_cuda(): source += glob.glob(os.path.join(ext_dir, '*.cu')) if not source: continue sources += source if paddle.device.is_compiled_with_cuda(): extension = CUDAExtension( sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']}) else: extension = CppExtension(sources) return extension if __name__ == "__main__": setup(name='ext_op', ext_modules=get_extensions()) ================================================ FILE: ppdet/ext_op/unittest/test_matched_rbox_iou.py ================================================ import numpy as np import sys import time from shapely.geometry import Polygon import paddle import unittest from ext_op import matched_rbox_iou def rbox2poly_single(rrect, get_best_begin_point=False): """ rrect:[x_ctr,y_ctr,w,h,angle] to poly:[x0,y0,x1,y1,x2,y2,x3,y3] """ x_ctr, y_ctr, width, height, angle = rrect[:5] tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 # rect 2x4 rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) R = 
np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]) # poly poly = R.dot(rect) x0, x1, x2, x3 = poly[0, :4] + x_ctr y0, y1, y2, y3 = poly[1, :4] + y_ctr poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) return poly def intersection(g, p): """ Intersection. """ g = g[:8].reshape((4, 2)) p = p[:8].reshape((4, 2)) a = g b = p use_filter = True if use_filter: # step1: inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: return 0. x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: return 0. g = Polygon(g) p = Polygon(p) if not g.is_valid or not p.is_valid: return 0 inter = Polygon(g).intersection(Polygon(p)).area union = g.area + p.area - inter if union == 0: return 0 else: return inter / union def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False): """ Args: anchors: [M, 5] x1,y1,x2,y2,angle gt_bboxes: [M, 5] x1,y1,x2,y2,angle Returns: macthed_iou: [M] """ assert anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 5 gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] anchors_ploy = [rbox2poly_single(e) for e in anchors] num = len(anchors_ploy) iou = np.zeros((num, ), dtype=np.float64) start_time = time.time() for i in range(num): try: iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i]) except Exception as e: print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], 'anchors_ploy[j]', anchors_ploy[i], e) return iou def gen_sample(n): rbox = np.random.rand(n, 5) rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 rbox[:, 4] = rbox[:, 4] - 0.5 return rbox class MatchedRBoxIoUTest(unittest.TestCase): def setUp(self): self.initTestCase() self.rbox1 = gen_sample(self.n) self.rbox2 = gen_sample(self.n) def initTestCase(self): self.n = 1000 def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) def get_places(self): places = [paddle.CPUPlace()] if paddle.device.is_compiled_with_cuda(): places.append(paddle.CUDAPlace(0)) return places def check_output(self, place): paddle.disable_static() pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy() poly_rbox1 = self.rbox1 poly_rbox2 = self.rbox2 poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) self.assertAllClose( actual_t, expect_t, msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( str(place), str(expect_t), str(actual_t))) def test_output(self): places = self.get_places() for place in places: self.check_output(place) if __name__ == "__main__": unittest.main() ================================================ FILE: ppdet/ext_op/unittest/test_rbox_iou.py ================================================ import numpy as np import sys import time from shapely.geometry import Polygon import paddle import unittest from ext_op import rbox_iou def rbox2poly_single(rrect, get_best_begin_point=False): """ rrect:[x_ctr,y_ctr,w,h,angle] to poly:[x0,y0,x1,y1,x2,y2,x3,y3] """ x_ctr, y_ctr, width, height, angle = 
rrect[:5] tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 # rect 2x4 rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) R = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]) # poly poly = R.dot(rect) x0, x1, x2, x3 = poly[0, :4] + x_ctr y0, y1, y2, y3 = poly[1, :4] + y_ctr poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) return poly def intersection(g, p): """ Intersection. """ g = g[:8].reshape((4, 2)) p = p[:8].reshape((4, 2)) a = g b = p use_filter = True if use_filter: # step1: inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: return 0. x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: return 0. g = Polygon(g) p = Polygon(p) if not g.is_valid or not p.is_valid: return 0 inter = Polygon(g).intersection(Polygon(p)).area union = g.area + p.area - inter if union == 0: return 0 else: return inter / union def rbox_overlaps(anchors, gt_bboxes, use_cv2=False): """ Args: anchors: [NA, 5] x1,y1,x2,y2,angle gt_bboxes: [M, 5] x1,y1,x2,y2,angle Returns: iou: [NA, M] """ assert anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 5 gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] anchors_ploy = [rbox2poly_single(e) for e in anchors] num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy) iou = np.zeros((num_anchors, num_gt), dtype=np.float64) start_time = time.time() for i in range(num_anchors): for j in range(num_gt): try: iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j]) except Exception as e: print('cur anchors_ploy[i]', anchors_ploy[i], 'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e) return iou def gen_sample(n): rbox = np.random.rand(n, 5) rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 rbox[:, 4] = rbox[:, 4] - 0.5 return rbox class RBoxIoUTest(unittest.TestCase): def setUp(self): self.initTestCase() self.rbox1 = gen_sample(self.n) self.rbox2 = gen_sample(self.m) def initTestCase(self): self.n = 13000 self.m = 7 def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) def get_places(self): places = [paddle.CPUPlace()] if paddle.device.is_compiled_with_cuda(): places.append(paddle.CUDAPlace(0)) return places def check_output(self, place): paddle.disable_static() pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy() poly_rbox1 = self.rbox1 poly_rbox2 = self.rbox2 poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) self.assertAllClose( actual_t, expect_t, msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( str(place), str(expect_t), str(actual_t))) def test_output(self): places = self.get_places() for place in places: self.check_output(place) if __name__ == "__main__": unittest.main() ================================================ FILE: ppdet/metrics/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import metrics
from . import keypoint_metrics

from .metrics import *
from .keypoint_metrics import *
from .pose3d_metrics import *

__all__ = metrics.__all__ + keypoint_metrics.__all__

from . import mot_metrics
from .mot_metrics import *
# accumulate the exported names rather than overwriting the earlier list
__all__ += mot_metrics.__all__

from . import mcmot_metrics
from .mcmot_metrics import *
__all__ += mcmot_metrics.__all__

from . import culane_metrics
from .culane_metrics import *
__all__ += culane_metrics.__all__

================================================
FILE: ppdet/metrics/coco_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import numpy as np
import itertools

from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
from ppdet.metrics.map_utils import draw_pr_curve

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)


def get_infer_results(outs, catid, bias=0, save_threshold=0):
    """
    Get results at the inference stage.
    The output format is a dictionary containing the bbox or mask results.

    For example, the bbox result is a list whose elements each contain
    image_id, category_id, bbox and score.
    """
    if outs is None or len(outs) == 0:
        raise ValueError(
            'The number of valid detection results is zero. Please use a reasonable model and check the input data.'
        )

    im_id = outs['im_id']
    im_file = outs['im_file'] if 'im_file' in outs else None

    infer_res = {}
    if 'bbox' in outs:
        if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6:
            infer_res['bbox'] = get_det_poly_res(
                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
        else:
            infer_res['bbox'] = get_det_res(
                outs['bbox'],
                outs['bbox_num'],
                im_id,
                catid,
                bias=bias,
                im_file=im_file,
                save_threshold=save_threshold)

    if 'mask' in outs:
        # mask post process
        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],
                                        outs['bbox_num'], im_id, catid)

    if 'segm' in outs:
        infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)

    if 'keypoint' in outs:
        infer_res['keypoint'] = get_keypoint_res(outs, im_id)
        outs['bbox_num'] = [len(infer_res['keypoint'])]

    if 'pose3d' in outs:
        infer_res['pose3d'] = get_pose3d_res(outs, im_id)
        outs['bbox_num'] = [len(infer_res['pose3d'])]

    return infer_res
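A hedged usage sketch for `cocoapi_eval`, which is defined next; the result file and annotation paths below are placeholders, not files shipped with this repo:

```
# illustrative call, assuming a COCO-format results file has already been written
from ppdet.metrics.coco_utils import cocoapi_eval

stats = cocoapi_eval(
    'output/bbox.json',  # placeholder: detections in COCO result format
    'bbox',              # COCOeval style
    anno_file='dataset/coco/annotations/instances_val2017.json',  # placeholder
    classwise=True)      # also prints per-category AP and saves P-R curves
print('mAP(0.50:0.95) =', stats[0])  # COCOeval stats[0] is AP at IoU .50:.95
```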
def cocoapi_eval(jsonfile,
                 style,
                 coco_gt=None,
                 anno_file=None,
                 max_dets=(100, 300, 1000),
                 classwise=False,
                 sigmas=None,
                 use_area=True):
    """
    Args:
        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`,
            `keypoints` and `keypoints_crowd`.
        coco_gt (COCO): A loaded COCO API object; if None, it is built from
            anno_file, eg: coco_gt = COCO(anno_file).
        anno_file (str): COCO annotations file.
        max_dets (tuple): COCO evaluation maxDets.
        classwise (bool): Whether to compute per-category AP and draw a P-R curve.
        sigmas (nparray): keypoint labelling sigmas.
        use_area (bool): If gt annotations (eg. CrowdPose, AIC)
            do not have 'area', please set use_area=False.
    """
    assert coco_gt != None or anno_file != None
    if style == 'keypoints_crowd':
        # please install xtcocotools==1.6
        from xtcocotools.coco import COCO
        from xtcocotools.cocoeval import COCOeval
    else:
        from pycocotools.coco import COCO
        try:
            from .fast_cocoeval import FastCOCOeval as COCOeval
        except:
            from pycocotools.cocoeval import COCOeval

    if coco_gt == None:
        coco_gt = COCO(anno_file)
    logger.info("Start evaluate...")
    coco_dt = coco_gt.loadRes(jsonfile)
    if style == 'proposal':
        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
        coco_eval.params.useCats = 0
        coco_eval.params.maxDets = list(max_dets)
    elif style == 'keypoints_crowd':
        coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)
    else:
        coco_eval = COCOeval(coco_gt, coco_dt, style)
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    if classwise:
        # Compute per-category AP and PR curve
        try:
            from terminaltables import AsciiTable
        except Exception as e:
            logger.error(
                'terminaltables not found, please install terminaltables.
' 'for example: `pip install terminaltables`.') raise e precisions = coco_eval.eval['precision'] cat_ids = coco_gt.getCatIds() # precision: (iou, recall, cls, area range, max dets) assert len(cat_ids) == precisions.shape[2] results_per_category = [] for idx, catId in enumerate(cat_ids): # area range index 0: all area ranges # max dets index -1: typically 100 per image nm = coco_gt.loadCats(catId)[0] precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] if precision.size: ap = np.mean(precision) else: ap = float('nan') results_per_category.append( (str(nm["name"]), '{:0.3f}'.format(float(ap)))) pr_array = precisions[0, :, idx, 0, 2] recall_array = np.arange(0.0, 1.01, 0.01) draw_pr_curve( pr_array, recall_array, out_dir=style + '_pr_curve', file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) num_columns = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) headers = ['category', 'AP'] * (num_columns // 2) results_2d = itertools.zip_longest( *[results_flatten[i::num_columns] for i in range(num_columns)]) table_data = [headers] table_data += [result for result in results_2d] table = AsciiTable(table_data) logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) logger.info("per-category PR curve has output to {} folder.".format( style + '_pr_curve')) # flush coco evaluation result sys.stdout.flush() return coco_eval.stats def json_eval_results(metric, json_directory, dataset): """ cocoapi eval with already exists proposal.json, bbox.json or mask.json """ assert metric == 'COCO' anno_file = dataset.get_anno() json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] if json_directory: assert os.path.exists( json_directory), "The json directory:{} does not exist".format( json_directory) for k, v in enumerate(json_file_list): json_file_list[k] = os.path.join(str(json_directory), v) coco_eval_style = ['proposal', 'bbox', 'segm'] for i, v_json in enumerate(json_file_list): if os.path.exists(v_json): cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) else: logger.info("{} not exists!".format(v_json)) ================================================ FILE: ppdet/metrics/culane_metrics.py ================================================ import os import cv2 import numpy as np import os.path as osp from functools import partial from .metrics import Metric from scipy.interpolate import splprep, splev from scipy.optimize import linear_sum_assignment from shapely.geometry import LineString, Polygon from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'draw_lane', 'discrete_cross_iou', 'continuous_cross_iou', 'interp', 'culane_metric', 'load_culane_img_data', 'load_culane_data', 'eval_predictions', "CULaneMetric" ] LIST_FILE = { 'train': 'list/train_gt.txt', 'val': 'list/val.txt', 'test': 'list/test.txt', } CATEGORYS = { 'normal': 'list/test_split/test0_normal.txt', 'crowd': 'list/test_split/test1_crowd.txt', 'hlight': 'list/test_split/test2_hlight.txt', 'shadow': 'list/test_split/test3_shadow.txt', 'noline': 'list/test_split/test4_noline.txt', 'arrow': 'list/test_split/test5_arrow.txt', 'curve': 'list/test_split/test6_curve.txt', 'cross': 'list/test_split/test7_cross.txt', 'night': 'list/test_split/test8_night.txt', } def draw_lane(lane, img=None, img_shape=None, width=30): if img is None: img = np.zeros(img_shape, dtype=np.uint8) lane = lane.astype(np.int32) for p1, p2 in zip(lane[:-1], lane[1:]): cv2.line( img, tuple(p1), tuple(p2), color=(255, 255, 255), 
thickness=width) return img def discrete_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): xs = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in xs] ys = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in ys] ious = np.zeros((len(xs), len(ys))) for i, x in enumerate(xs): for j, y in enumerate(ys): ious[i, j] = (x & y).sum() / (x | y).sum() return ious def continuous_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): h, w, _ = img_shape image = Polygon([(0, 0), (0, h - 1), (w - 1, h - 1), (w - 1, 0)]) xs = [ LineString(lane).buffer( distance=width / 2., cap_style=1, join_style=2).intersection(image) for lane in xs ] ys = [ LineString(lane).buffer( distance=width / 2., cap_style=1, join_style=2).intersection(image) for lane in ys ] ious = np.zeros((len(xs), len(ys))) for i, x in enumerate(xs): for j, y in enumerate(ys): ious[i, j] = x.intersection(y).area / x.union(y).area return ious def interp(points, n=50): x = [x for x, _ in points] y = [y for _, y in points] tck, u = splprep([x, y], s=0, t=n, k=min(3, len(points) - 1)) u = np.linspace(0., 1., num=(len(u) - 1) * n + 1) return np.array(splev(u, tck)).T def culane_metric(pred, anno, width=30, iou_thresholds=[0.5], official=True, img_shape=(590, 1640, 3)): _metric = {} for thr in iou_thresholds: tp = 0 fp = 0 if len(anno) != 0 else len(pred) fn = 0 if len(pred) != 0 else len(anno) _metric[thr] = [tp, fp, fn] interp_pred = np.array( [interp( pred_lane, n=5) for pred_lane in pred], dtype=object) # (4, 50, 2) interp_anno = np.array( [interp( anno_lane, n=5) for anno_lane in anno], dtype=object) # (4, 50, 2) if official: ious = discrete_cross_iou( interp_pred, interp_anno, width=width, img_shape=img_shape) else: ious = continuous_cross_iou( interp_pred, interp_anno, width=width, img_shape=img_shape) row_ind, col_ind = linear_sum_assignment(1 - ious) _metric = {} for thr in iou_thresholds: tp = int((ious[row_ind, col_ind] > thr).sum()) fp = len(pred) - tp fn = len(anno) - tp _metric[thr] = [tp, fp, fn] return _metric def load_culane_img_data(path): with open(path, 'r') as data_file: img_data = data_file.readlines() img_data = [line.split() for line in img_data] img_data = [list(map(float, lane)) for lane in img_data] img_data = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)] for lane in img_data] img_data = [lane for lane in img_data if len(lane) >= 2] return img_data def load_culane_data(data_dir, file_list_path): with open(file_list_path, 'r') as file_list: filepaths = [ os.path.join(data_dir, line[1 if line[0] == '/' else 0:].rstrip().replace( '.jpg', '.lines.txt')) for line in file_list.readlines() ] data = [] for path in filepaths: img_data = load_culane_img_data(path) data.append(img_data) return data def eval_predictions(pred_dir, anno_dir, list_path, iou_thresholds=[0.5], width=30, official=True, sequential=False): logger.info('Calculating metric for List: {}'.format(list_path)) predictions = load_culane_data(pred_dir, list_path) annotations = load_culane_data(anno_dir, list_path) img_shape = (590, 1640, 3) if sequential: results = map(partial( culane_metric, width=width, official=official, iou_thresholds=iou_thresholds, img_shape=img_shape), predictions, annotations) else: from multiprocessing import Pool, cpu_count from itertools import repeat with Pool(cpu_count()) as p: results = p.starmap(culane_metric, zip(predictions, annotations, repeat(width), repeat(iou_thresholds), repeat(official), repeat(img_shape))) mean_f1, mean_prec, mean_recall, total_tp, total_fp, total_fn = 0, 0, 0, 
0, 0, 0 ret = {} for thr in iou_thresholds: tp = sum(m[thr][0] for m in results) fp = sum(m[thr][1] for m in results) fn = sum(m[thr][2] for m in results) precision = float(tp) / (tp + fp) if tp != 0 else 0 recall = float(tp) / (tp + fn) if tp != 0 else 0 f1 = 2 * precision * recall / (precision + recall) if tp != 0 else 0 logger.info('iou thr: {:.2f}, tp: {}, fp: {}, fn: {},' 'precision: {}, recall: {}, f1: {}'.format( thr, tp, fp, fn, precision, recall, f1)) mean_f1 += f1 / len(iou_thresholds) mean_prec += precision / len(iou_thresholds) mean_recall += recall / len(iou_thresholds) total_tp += tp total_fp += fp total_fn += fn ret[thr] = { 'TP': tp, 'FP': fp, 'FN': fn, 'Precision': precision, 'Recall': recall, 'F1': f1 } if len(iou_thresholds) > 2: logger.info( 'mean result, total_tp: {}, total_fp: {}, total_fn: {},' 'precision: {}, recall: {}, f1: {}'.format( total_tp, total_fp, total_fn, mean_prec, mean_recall, mean_f1)) ret['mean'] = { 'TP': total_tp, 'FP': total_fp, 'FN': total_fn, 'Precision': mean_prec, 'Recall': mean_recall, 'F1': mean_f1 } return ret class CULaneMetric(Metric): def __init__(self, cfg, output_eval=None, split="test", dataset_dir="dataset/CULane/"): super(CULaneMetric, self).__init__() self.output_eval = "evaluation" if output_eval is None else output_eval self.dataset_dir = dataset_dir self.split = split self.list_path = osp.join(dataset_dir, LIST_FILE[split]) self.predictions = [] self.img_names = [] self.lanes = [] self.eval_results = {} self.cfg = cfg self.reset() def reset(self): self.predictions = [] self.img_names = [] self.lanes = [] self.eval_results = {} def get_prediction_string(self, pred): ys = np.arange(270, 590, 8) / self.cfg.ori_img_h out = [] for lane in pred: xs = lane(ys) valid_mask = (xs >= 0) & (xs < 1) xs = xs * self.cfg.ori_img_w lane_xs = xs[valid_mask] lane_ys = ys[valid_mask] * self.cfg.ori_img_h lane_xs, lane_ys = lane_xs[::-1], lane_ys[::-1] lane_str = ' '.join([ '{:.5f} {:.5f}'.format(x, y) for x, y in zip(lane_xs, lane_ys) ]) if lane_str != '': out.append(lane_str) return '\n'.join(out) def accumulate(self): loss_lines = [[], [], [], []] for idx, pred in enumerate(self.predictions): output_dir = os.path.join(self.output_eval, os.path.dirname(self.img_names[idx])) output_filename = os.path.basename(self.img_names[ idx])[:-3] + 'lines.txt' os.makedirs(output_dir, exist_ok=True) output = self.get_prediction_string(pred) # store loss lines lanes = self.lanes[idx] if len(lanes) - len(pred) in [1, 2, 3, 4]: loss_lines[len(lanes) - len(pred) - 1].append(self.img_names[ idx]) with open(os.path.join(output_dir, output_filename), 'w') as out_file: out_file.write(output) for i, names in enumerate(loss_lines): with open( os.path.join(output_dir, 'loss_{}_lines.txt'.format(i + 1)), 'w') as f: for name in names: f.write(name + '\n') for cate, cate_file in CATEGORYS.items(): result = eval_predictions( self.output_eval, self.dataset_dir, os.path.join(self.dataset_dir, cate_file), iou_thresholds=[0.5], official=True) result = eval_predictions( self.output_eval, self.dataset_dir, self.list_path, iou_thresholds=np.linspace(0.5, 0.95, 10), official=True) self.eval_results['F1@50'] = result[0.5]['F1'] self.eval_results['result'] = result def update(self, inputs, outputs): assert len(inputs['img_name']) == len(outputs['lanes']) self.predictions.extend(outputs['lanes']) self.img_names.extend(inputs['img_name']) self.lanes.extend(inputs['lane_line']) def log(self): logger.info(self.eval_results) # abstract method for getting metric results def 
================================================
FILE: ppdet/metrics/fast_cocoeval/README.md
================================================
# Compiling the COCOeval C++ extension

## Installation

```
cd ext
python setup.py install
```

================================================
FILE: ppdet/metrics/fast_cocoeval/__init__.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import fast_cocoeval
from .fast_cocoeval import *

================================================
FILE: ppdet/metrics/fast_cocoeval/ext/cocoeval.cc
================================================
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/

#include "cocoeval.h"
// NOTE: the header names below were lost to angle-bracket stripping during
// extraction; restored from the detectron2 source this file is based on.
#include <time.h>
#include <algorithm>
#include <cstdint>
#include <numeric>

using namespace pybind11::literals;

// Sort detections from highest score to lowest, such that
// detection_instances[detection_sorted_indices[t]] >=
// detection_instances[detection_sorted_indices[t+1]].
Use stable_sort to match // original COCO API void SortInstancesByDetectionScore( const std::vector& detection_instances, std::vector* detection_sorted_indices) { detection_sorted_indices->resize(detection_instances.size()); std::iota( detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); std::stable_sort( detection_sorted_indices->begin(), detection_sorted_indices->end(), [&detection_instances](size_t j1, size_t j2) { return detection_instances[j1].score > detection_instances[j2].score; }); } // Partition the ground truth objects based on whether or not to ignore them // based on area void SortInstancesByIgnore( const std::array& area_range, const std::vector& ground_truth_instances, std::vector* ground_truth_sorted_indices, std::vector* ignores) { ignores->clear(); ignores->reserve(ground_truth_instances.size()); for (auto o : ground_truth_instances) { ignores->push_back( o.ignore || o.area < area_range[0] || o.area > area_range[1]); } ground_truth_sorted_indices->resize(ground_truth_instances.size()); std::iota( ground_truth_sorted_indices->begin(), ground_truth_sorted_indices->end(), 0); std::stable_sort( ground_truth_sorted_indices->begin(), ground_truth_sorted_indices->end(), [&ignores](size_t j1, size_t j2) { return (int)(*ignores)[j1] < (int)(*ignores)[j2]; }); } // For each IOU threshold, greedily match each detected instance to a ground // truth instance (if possible) and store the results void MatchDetectionsToGroundTruth( const std::vector& detection_instances, const std::vector& detection_sorted_indices, const std::vector& ground_truth_instances, const std::vector& ground_truth_sorted_indices, const std::vector& ignores, const std::vector>& ious, const std::vector& iou_thresholds, const std::array& area_range, ImageEvaluation* results) { // Initialize memory to store return data matches and ignore const int num_iou_thresholds = iou_thresholds.size(); const int num_ground_truth = ground_truth_sorted_indices.size(); const int num_detections = detection_sorted_indices.size(); std::vector ground_truth_matches( num_iou_thresholds * num_ground_truth, 0); std::vector& detection_matches = results->detection_matches; std::vector& detection_ignores = results->detection_ignores; std::vector& ground_truth_ignores = results->ground_truth_ignores; detection_matches.resize(num_iou_thresholds * num_detections, 0); detection_ignores.resize(num_iou_thresholds * num_detections, false); ground_truth_ignores.resize(num_ground_truth); for (auto g = 0; g < num_ground_truth; ++g) { ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; } for (auto t = 0; t < num_iou_thresholds; ++t) { for (auto d = 0; d < num_detections; ++d) { // information about best match so far (match=-1 -> unmatched) double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); int match = -1; for (auto g = 0; g < num_ground_truth; ++g) { // if this ground truth instance is already matched and not a // crowd, it cannot be matched to another detection if (ground_truth_matches[t * num_ground_truth + g] > 0 && !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { continue; } // if detected instance matched to a regular ground truth // instance, we can break on the first ground truth instance // tagged as ignore (because they are sorted by the ignore tag) if (match >= 0 && !ground_truth_ignores[match] && ground_truth_ignores[g]) { break; } // if IOU overlap is the best so far, store the match appropriately if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { best_iou = 
ious[d][ground_truth_sorted_indices[g]]; match = g; } } // if match was made, store id of match for both detection and // ground truth if (match >= 0) { detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; detection_matches[t * num_detections + d] = ground_truth_instances[ground_truth_sorted_indices[match]].id; ground_truth_matches[t * num_ground_truth + match] = detection_instances[detection_sorted_indices[d]].id; } // set unmatched detections outside of area range to ignore const InstanceAnnotation& detection = detection_instances[detection_sorted_indices[d]]; detection_ignores[t * num_detections + d] = detection_ignores[t * num_detections + d] || (detection_matches[t * num_detections + d] == 0 && (detection.area < area_range[0] || detection.area > area_range[1])); } } // store detection score results results->detection_scores.resize(detection_sorted_indices.size()); for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { results->detection_scores[d] = detection_instances[detection_sorted_indices[d]].score; } } std::vector EvaluateImages( const std::vector>& area_ranges, int max_detections, const std::vector& iou_thresholds, const ImageCategoryInstances>& image_category_ious, const ImageCategoryInstances& image_category_ground_truth_instances, const ImageCategoryInstances& image_category_detection_instances) { const int num_area_ranges = area_ranges.size(); const int num_images = image_category_ground_truth_instances.size(); const int num_categories = image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; std::vector detection_sorted_indices; std::vector ground_truth_sorted_indices; std::vector ignores; std::vector results_all( num_images * num_area_ranges * num_categories); // Store results for each image, category, and area range combination. Results // for each IOU threshold are packed into the same ImageEvaluation object for (auto i = 0; i < num_images; ++i) { for (auto c = 0; c < num_categories; ++c) { const std::vector& ground_truth_instances = image_category_ground_truth_instances[i][c]; const std::vector& detection_instances = image_category_detection_instances[i][c]; SortInstancesByDetectionScore( detection_instances, &detection_sorted_indices); if ((int)detection_sorted_indices.size() > max_detections) { detection_sorted_indices.resize(max_detections); } for (size_t a = 0; a < area_ranges.size(); ++a) { SortInstancesByIgnore( area_ranges[a], ground_truth_instances, &ground_truth_sorted_indices, &ignores); MatchDetectionsToGroundTruth( detection_instances, detection_sorted_indices, ground_truth_instances, ground_truth_sorted_indices, ignores, image_category_ious[i][c], iou_thresholds, area_ranges[a], &results_all [c * num_area_ranges * num_images + a * num_images + i]); } } } return results_all; } // Convert a python list to a vector template std::vector list_to_vec(const py::list& l) { std::vector v(py::len(l)); for (int i = 0; i < (int)py::len(l); ++i) { v[i] = l[i].cast(); } return v; } // Helper function to Accumulate() // Considers the evaluation results applicable to a particular category, area // range, and max_detections parameter setting, which begin at // evaluations[evaluation_index]. Extracts a sorted list of length n of all // applicable detection instances concatenated across all images in the dataset, // which are represented by the outputs evaluation_indices, detection_scores, // image_detection_indices, and detection_sorted_indices--all of which are // length n. 
evaluation_indices[i] stores the applicable index into // evaluations[] for instance i, which has detection score detection_score[i], // and is the image_detection_indices[i]'th of the list of detections // for the image containing i. detection_sorted_indices[] defines a sorted // permutation of the 3 other outputs int BuildSortedDetectionList( const std::vector& evaluations, const int64_t evaluation_index, const int64_t num_images, const int max_detections, std::vector* evaluation_indices, std::vector* detection_scores, std::vector* detection_sorted_indices, std::vector* image_detection_indices) { assert(evaluations.size() >= evaluation_index + num_images); // Extract a list of object instances of the applicable category, area // range, and max detections requirements such that they can be sorted image_detection_indices->clear(); evaluation_indices->clear(); detection_scores->clear(); image_detection_indices->reserve(num_images * max_detections); evaluation_indices->reserve(num_images * max_detections); detection_scores->reserve(num_images * max_detections); int num_valid_ground_truth = 0; for (auto i = 0; i < num_images; ++i) { const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; for (int d = 0; d < (int)evaluation.detection_scores.size() && d < max_detections; ++d) { // detected instances evaluation_indices->push_back(evaluation_index + i); image_detection_indices->push_back(d); detection_scores->push_back(evaluation.detection_scores[d]); } for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { if (!ground_truth_ignore) { ++num_valid_ground_truth; } } } // Sort detections by decreasing score, using stable sort to match // python implementation detection_sorted_indices->resize(detection_scores->size()); std::iota( detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); std::stable_sort( detection_sorted_indices->begin(), detection_sorted_indices->end(), [&detection_scores](size_t j1, size_t j2) { return (*detection_scores)[j1] > (*detection_scores)[j2]; }); return num_valid_ground_truth; } // Helper function to Accumulate() // Compute a precision recall curve given a sorted list of detected instances // encoded in evaluations, evaluation_indices, detection_scores, // detection_sorted_indices, image_detection_indices (see // BuildSortedDetectionList()). Using vectors precisions and recalls // and temporary storage, output the results into precisions_out, recalls_out, // and scores_out, which are large buffers containing many precion/recall curves // for all possible parameter settings, with precisions_out_index and // recalls_out_index defining the applicable indices to store results. 
void ComputePrecisionRecallCurve( const int64_t precisions_out_index, const int64_t precisions_out_stride, const int64_t recalls_out_index, const std::vector& recall_thresholds, const int iou_threshold_index, const int num_iou_thresholds, const int num_valid_ground_truth, const std::vector& evaluations, const std::vector& evaluation_indices, const std::vector& detection_scores, const std::vector& detection_sorted_indices, const std::vector& image_detection_indices, std::vector* precisions, std::vector* recalls, std::vector* precisions_out, std::vector* scores_out, std::vector* recalls_out) { assert(recalls_out->size() > recalls_out_index); // Compute precision/recall for each instance in the sorted list of detections int64_t true_positives_sum = 0, false_positives_sum = 0; precisions->clear(); recalls->clear(); precisions->reserve(detection_sorted_indices.size()); recalls->reserve(detection_sorted_indices.size()); assert(!evaluations.empty() || detection_sorted_indices.empty()); for (auto detection_sorted_index : detection_sorted_indices) { const ImageEvaluation& evaluation = evaluations[evaluation_indices[detection_sorted_index]]; const auto num_detections = evaluation.detection_matches.size() / num_iou_thresholds; const auto detection_index = iou_threshold_index * num_detections + image_detection_indices[detection_sorted_index]; assert(evaluation.detection_matches.size() > detection_index); assert(evaluation.detection_ignores.size() > detection_index); const int64_t detection_match = evaluation.detection_matches[detection_index]; const bool detection_ignores = evaluation.detection_ignores[detection_index]; const auto true_positive = detection_match > 0 && !detection_ignores; const auto false_positive = detection_match == 0 && !detection_ignores; if (true_positive) { ++true_positives_sum; } if (false_positive) { ++false_positives_sum; } const double recall = static_cast(true_positives_sum) / num_valid_ground_truth; recalls->push_back(recall); const int64_t num_valid_detections = true_positives_sum + false_positives_sum; const double precision = num_valid_detections > 0 ? static_cast(true_positives_sum) / num_valid_detections : 0.0; precisions->push_back(precision); } (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { if ((*precisions)[i] > (*precisions)[i - 1]) { (*precisions)[i - 1] = (*precisions)[i]; } } // Sample the per instance precision/recall list at each recall threshold for (size_t r = 0; r < recall_thresholds.size(); ++r) { // first index in recalls >= recall_thresholds[r] std::vector::iterator low = std::lower_bound( recalls->begin(), recalls->end(), recall_thresholds[r]); size_t precisions_index = low - recalls->begin(); const auto results_ind = precisions_out_index + r * precisions_out_stride; assert(results_ind < precisions_out->size()); assert(results_ind < scores_out->size()); if (precisions_index < precisions->size()) { (*precisions_out)[results_ind] = (*precisions)[precisions_index]; (*scores_out)[results_ind] = detection_scores[detection_sorted_indices[precisions_index]]; } else { (*precisions_out)[results_ind] = 0; (*scores_out)[results_ind] = 0; } } } py::dict Accumulate( const py::object& params, const std::vector& evaluations) { const std::vector recall_thresholds = list_to_vec(params.attr("recThrs")); const std::vector max_detections = list_to_vec(params.attr("maxDets")); const int num_iou_thresholds = py::len(params.attr("iouThrs")); const int num_recall_thresholds = py::len(params.attr("recThrs")); const int num_categories = params.attr("useCats").cast() == 1 ? py::len(params.attr("catIds")) : 1; const int num_area_ranges = py::len(params.attr("areaRng")); const int num_max_detections = py::len(params.attr("maxDets")); const int num_images = py::len(params.attr("imgIds")); std::vector precisions_out( num_iou_thresholds * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections, -1); std::vector recalls_out( num_iou_thresholds * num_categories * num_area_ranges * num_max_detections, -1); std::vector scores_out( num_iou_thresholds * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections, -1); // Consider the list of all detected instances in the entire dataset in one // large list. evaluation_indices, detection_scores, // image_detection_indices, and detection_sorted_indices all have the same // length as this list, such that each entry corresponds to one detected // instance std::vector evaluation_indices; // indices into evaluations[] std::vector detection_scores; // detection scores of each instance std::vector detection_sorted_indices; // sorted indices of all // instances in the dataset std::vector image_detection_indices; // indices into the list of detected instances in // the same image as each instance std::vector precisions, recalls; for (auto c = 0; c < num_categories; ++c) { for (auto a = 0; a < num_area_ranges; ++a) { for (auto m = 0; m < num_max_detections; ++m) { // The COCO PythonAPI assumes evaluations[] (the return value of // COCOeval::EvaluateImages() is one long list storing results for each // combination of category, area range, and image id, with categories in // the outermost loop and images in the innermost loop. 
          const int64_t evaluations_index =
              c * num_area_ranges * num_images + a * num_images;
          int num_valid_ground_truth = BuildSortedDetectionList(
              evaluations,
              evaluations_index,
              num_images,
              max_detections[m],
              &evaluation_indices,
              &detection_scores,
              &detection_sorted_indices,
              &image_detection_indices);
          if (num_valid_ground_truth == 0) {
            continue;
          }

          for (auto t = 0; t < num_iou_thresholds; ++t) {
            // recalls_out is a flattened vector representing a
            // num_iou_thresholds X num_categories X num_area_ranges X
            // num_max_detections matrix
            const int64_t recalls_out_index =
                t * num_categories * num_area_ranges * num_max_detections +
                c * num_area_ranges * num_max_detections +
                a * num_max_detections + m;

            // precisions_out and scores_out are flattened vectors
            // representing a num_iou_thresholds X num_recall_thresholds X
            // num_categories X num_area_ranges X num_max_detections matrix
            const int64_t precisions_out_stride =
                num_categories * num_area_ranges * num_max_detections;
            const int64_t precisions_out_index =
                t * num_recall_thresholds * num_categories * num_area_ranges *
                    num_max_detections +
                c * num_area_ranges * num_max_detections +
                a * num_max_detections + m;

            ComputePrecisionRecallCurve(
                precisions_out_index,
                precisions_out_stride,
                recalls_out_index,
                recall_thresholds,
                t,
                num_iou_thresholds,
                num_valid_ground_truth,
                evaluations,
                evaluation_indices,
                detection_scores,
                detection_sorted_indices,
                image_detection_indices,
                &precisions,
                &recalls,
                &precisions_out,
                &scores_out,
                &recalls_out);
          }
        }
      }
    }

  time_t rawtime;
  struct tm local_time;
  std::array<char, 200> buffer;
  time(&rawtime);
#ifdef _WIN32
  localtime_s(&local_time, &rawtime);
#else
  localtime_r(&rawtime, &local_time);
#endif
  // NOTE: restored "%M" (minutes); the original format string read
  // "%H:%num_max_detections:%S", an artifact of an identifier rename.
  strftime(buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);
  return py::dict(
      "params"_a = params,
      "counts"_a = std::vector<int64_t>({num_iou_thresholds,
                                         num_recall_thresholds,
                                         num_categories,
                                         num_area_ranges,
                                         num_max_detections}),
      "date"_a = buffer,
      "precision"_a = precisions_out,
      "recall"_a = recalls_out,
      "scores"_a = scores_out);
}

PYBIND11_MODULE(cocoeval_ext, m) {
  m.def("COCOevalAccumulate", &Accumulate, "Accumulate");
  m.def("COCOevalEvaluateImages", &EvaluateImages, "EvaluateImages");
  // NOTE: template arguments restored to match the constructor declared in
  // cocoeval.h; they were lost to angle-bracket stripping during extraction.
  py::class_<InstanceAnnotation>(m, "InstanceAnnotation")
      .def(py::init<uint64_t, double, double, bool, bool>());
  py::class_<ImageEvaluation>(m, "ImageEvaluation").def(py::init<>());
}
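The `counts` entry returned by `Accumulate` encodes the shapes of the flattened `precision`, `recall`, and `scores` buffers. A small sketch of the reshape that `fast_cocoeval.py` performs (the sizes here are illustrative only):

```python
import numpy as np

# counts = [num_iou_thrs, num_recall_thrs, num_cats, num_area_rngs, num_max_dets]
counts = [2, 101, 3, 4, 3]  # illustrative sizes only
T, R, K, A, M = counts
precision = np.full(T * R * K * A * M, -1.0).reshape(counts)
# recall has no recall-threshold axis
recall = np.full(T * K * A * M, -1.0).reshape(counts[:1] + counts[2:])
print(precision.shape, recall.shape)  # (2, 101, 3, 4, 3) (2, 3, 4, 3)
```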
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/cocoeval.h
================================================
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/

#pragma once

// NOTE: the include list was lost to angle-bracket stripping during
// extraction; restored from the detectron2 original.
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <vector>

namespace py = pybind11;

// Annotation data for a single object instance in an image
struct InstanceAnnotation {
  InstanceAnnotation(
      uint64_t id, double score, double area, bool is_crowd, bool ignore)
      : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
  uint64_t id;
  double score = 0.;
  double area = 0.;
  bool is_crowd = false;
  bool ignore = false;
};

// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This
// stores matches between detected and ground truth instances
struct ImageEvaluation {
  // For each of the D detected instances, the id of the matched ground truth
  // instance, or 0 if unmatched
  std::vector<uint64_t> detection_matches;

  // The detection score of each of the D detected instances
  std::vector<double> detection_scores;

  // Marks whether or not each of G instances was ignored from evaluation
  // (e.g., because it's outside area_range)
  std::vector<bool> ground_truth_ignores;

  // Marks whether or not each of D instances was ignored from evaluation
  // (e.g., because it's outside aRng)
  std::vector<bool> detection_ignores;
};

template <class T>
using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;

// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For
// each combination of image, category, area range settings, and IOU
// thresholds to evaluate, it matches detected instances to ground truth
// instances and stores the results into a vector of ImageEvaluation results,
// which will be interpreted by the COCOeval::Accumulate() function to
// produce precision-recall curves. The parameters of nested vectors have
// the following semantics:
//   image_category_ious[i][c][d][g] is the intersection over union of the
//     d'th detected instance and g'th ground truth instance of
//     category category_ids[c] in image image_ids[i]
//   image_category_ground_truth_instances[i][c] is a vector of ground truth
//     instances in image image_ids[i] of category category_ids[c]
//   image_category_detection_instances[i][c] is a vector of detected
//     instances in image image_ids[i] of category category_ids[c]
std::vector<ImageEvaluation> EvaluateImages(
    const std::vector<std::array<double, 2>>& area_ranges,  // vector of 2-tuples
    int max_detections,
    const std::vector<double>& iou_thresholds,
    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_ground_truth_instances,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_detection_instances);

// C++ implementation of COCOeval.accumulate(), which generates precision
// recall curves for each set of category, IOU threshold, detection area
// range, and max number of detections parameters. It is assumed that the
// parameter evaluations is the return value of the function
// COCOeval::EvaluateImages(), which was called with the same parameter
// settings params
py::dict Accumulate(
    const py::object& params,
    const std::vector<ImageEvaluation>& evaluations);
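A quick smoke test for the compiled module (assuming `python setup.py install` with the `setup.py` below succeeded); the names mirror the `PYBIND11_MODULE` bindings in cocoeval.cc above:

```python
# Constructor order follows py::init<uint64_t, double, double, bool, bool>:
# id, score, area, is_crowd, ignore.
from cocoeval_ext import InstanceAnnotation, ImageEvaluation

ann = InstanceAnnotation(1, 0.9, 100.0, False, False)
ev = ImageEvaluation()
print(type(ann).__name__, type(ev).__name__)
```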
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/setup.py
================================================
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup

ext_modules = [Pybind11Extension("cocoeval_ext", ["cocoeval.cc"])]

setup(
    name="cocoeval_ext",
    version="0.0.0",
    ext_modules=ext_modules,
    cmdclass={"build_ext": build_ext},
)
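`FastCOCOeval`, defined in the next file, keeps the `pycocotools` `COCOeval` interface, so existing evaluation code only needs the import swapped. A minimal usage sketch; the annotation and result file names are placeholders:

```python
from pycocotools.coco import COCO
from ppdet.metrics.fast_cocoeval import FastCOCOeval

coco_gt = COCO('annotations/instances_val2017.json')  # placeholder path
coco_dt = coco_gt.loadRes('bbox.json')                # placeholder path
coco_eval = FastCOCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()    # per-image matching runs in C++
coco_eval.accumulate()  # PR-curve accumulation runs in C++
coco_eval.summarize()   # unchanged pycocotools summary
```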
================================================
FILE: ppdet/metrics/fast_cocoeval/fast_cocoeval.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The code is based on
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py

import copy
import time
import numpy as np
from cocoeval_ext import InstanceAnnotation, COCOevalEvaluateImages, COCOevalAccumulate
from pycocotools.cocoeval import COCOeval

__all__ = ['FastCOCOeval']


class FastCOCOeval(COCOeval):
    """
    This is a slightly modified version of the original COCO API, where the
    functions evaluateImg() and accumulate() are implemented in C++ to
    speed up evaluation.
    """

    def evaluate(self):
        """
        Run per image evaluation on given images and store results in
        self._evalImgs_cpp, a datastructure that isn't readable from Python
        but is used by a C++ implementation of accumulate(). Unlike the
        original COCO PythonAPI, we don't populate the datastructure
        self.evalImgs because this datastructure is a computational
        bottleneck.
        :return: None
        """
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if p.useSegm is not None:
            p.iouType = "segm" if p.useSegm == 1 else "bbox"
            print('useSegm (deprecated) is not None. '
                  'Running {} evaluation'.format(p.iouType))
        print('Evaluate annotation type *{}*'.format(p.iouType))
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params = p

        self._prepare()  # bottleneck

        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == "segm" or p.iouType == "bbox":
            computeIoU = self.computeIoU
        elif p.iouType == "keypoints":
            computeIoU = self.computeOks
        self.ious = {
            (imgId, catId): computeIoU(imgId, catId)
            for imgId in p.imgIds for catId in catIds
        }  # bottleneck

        maxDet = p.maxDets[-1]

        # <<<< Beginning of code differences with original COCO API
        def convert_instances_to_cpp(instances, is_det=False):
            # Convert annotations for a list of instances in an image to a
            # format that's fast to access in C++
            instances_cpp = []
            for instance in instances:
                instance_cpp = InstanceAnnotation(
                    int(instance["id"]),
                    instance["score"] if is_det else instance.get("score", 0.0),
                    instance["area"],
                    bool(instance.get("iscrowd", 0)),
                    bool(instance.get("ignore", 0)), )
                instances_cpp.append(instance_cpp)
            return instances_cpp

        # Convert GT annotations, detections, and IOUs to a format that's
        # fast to access in C++
        ground_truth_instances = [
            [convert_instances_to_cpp(self._gts[imgId, catId])
             for catId in p.catIds] for imgId in p.imgIds
        ]
        detected_instances = [
            [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
             for catId in p.catIds] for imgId in p.imgIds
        ]
        ious = [[self.ious[imgId, catId] for catId in catIds]
                for imgId in p.imgIds]

        if not p.useCats:
            # For each image, flatten per-category lists into a single list
            ground_truth_instances = [[[o for c in i for o in c]]
                                      for i in ground_truth_instances]
            detected_instances = [[[o for c in i for o in c]]
                                  for i in detected_instances]

        # Call C++ implementation of self.evaluateImgs()
        self._evalImgs_cpp = COCOevalEvaluateImages(
            p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances,
            detected_instances)
        self._evalImgs = None

        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc - tic))
        # >>>> End of code differences with original COCO API

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in
        self.eval. Does not support changing parameter settings from those
        used by self.evaluate()
        """
        print('Accumulating evaluation results...')
        tic = time.time()
        assert hasattr(
            self, "_evalImgs_cpp"
        ), "evaluate() must be called before accumulate() is called."
        self.eval = COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)

        # recall is num_iou_thresholds X num_categories X num_area_ranges X
        # num_max_detections
        self.eval["recall"] = np.array(self.eval["recall"]).reshape(
            self.eval["counts"][:1] + self.eval["counts"][2:])

        # precision and scores are num_iou_thresholds X num_recall_thresholds
        # X num_categories X num_area_ranges X num_max_detections
        self.eval["precision"] = np.array(self.eval["precision"]).reshape(
            self.eval["counts"])
        self.eval["scores"] = np.array(self.eval["scores"]).reshape(
            self.eval["counts"])
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc - tic))

================================================
FILE: ppdet/metrics/json_results.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import six import numpy as np def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0, im_file=None, save_threshold=0): det_res = [] k = 0 for i in range(len(bbox_nums)): cur_image_id = int(image_id[i][0]) det_nums = bbox_nums[i] for j in range(det_nums): dt = bboxes[k] k = k + 1 num_id, score, xmin, ymin, xmax, ymax = dt.tolist() if int(num_id) < 0 or score < save_threshold: continue category_id = label_to_cat_id_map[int(num_id)] w = xmax - xmin + bias h = ymax - ymin + bias bbox = [xmin, ymin, w, h] dt_res = { 'image_id': cur_image_id, 'category_id': category_id, 'bbox': bbox, 'score': score } if im_file: dt_res['im_file'] = im_file det_res.append(dt_res) return det_res def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): det_res = [] k = 0 for i in range(len(bbox_nums)): cur_image_id = int(image_id[i][0]) det_nums = bbox_nums[i] for j in range(det_nums): dt = bboxes[k] k = k + 1 num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() if int(num_id) < 0: continue category_id = label_to_cat_id_map[int(num_id)] rbox = [x1, y1, x2, y2, x3, y3, x4, y4] dt_res = { 'image_id': cur_image_id, 'category_id': category_id, 'bbox': rbox, 'score': score } det_res.append(dt_res) return det_res def strip_mask(mask): row = mask[0, 0, :] col = mask[0, :, 0] im_h = len(col) - np.count_nonzero(col == -1) im_w = len(row) - np.count_nonzero(row == -1) return mask[:, :im_h, :im_w] def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): import pycocotools.mask as mask_util seg_res = [] k = 0 for i in range(len(mask_nums)): cur_image_id = int(image_id[i][0]) det_nums = mask_nums[i] mask_i = masks[k:k + det_nums] mask_i = strip_mask(mask_i) for j in range(det_nums): mask = mask_i[j].astype(np.uint8) score = float(bboxes[k][1]) label = int(bboxes[k][0]) k = k + 1 if label == -1: continue cat_id = label_to_cat_id_map[label] rle = mask_util.encode( np.array( mask[:, :, None], order="F", dtype="uint8"))[0] if six.PY3: if 'counts' in rle: rle['counts'] = rle['counts'].decode("utf8") sg_res = { 'image_id': cur_image_id, 'category_id': cat_id, 'segmentation': rle, 'score': score } seg_res.append(sg_res) return seg_res def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): import pycocotools.mask as mask_util segm_res = [] # for each batch segms = results['segm'].astype(np.uint8) clsid_labels = results['cate_label'] clsid_scores = results['cate_score'] lengths = segms.shape[0] im_id = int(image_id[0][0]) if lengths == 0 or segms is None: return None # for each sample for i in range(lengths - 1): clsid = int(clsid_labels[i]) catid = num_id_to_cat_id_map[clsid] score = float(clsid_scores[i]) mask = segms[i] segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] segm['counts'] = segm['counts'].decode('utf8') coco_res = { 'image_id': im_id, 'category_id': catid, 'segmentation': segm, 'score': score } segm_res.append(coco_res) return segm_res def get_keypoint_res(results, im_id): anns = [] preds = results['keypoint'] for idx in range(im_id.shape[0]): image_id = im_id[idx].item() kpts, scores = preds[idx] for kpt, score in zip(kpts, 
scores): kpt = kpt.flatten() ann = { 'image_id': image_id, 'category_id': 1, # XXX hard code 'keypoints': kpt.tolist(), 'score': float(score) } x = kpt[0::3] y = kpt[1::3] x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), np.min(y).item( ), np.max(y).item() ann['area'] = (x1 - x0) * (y1 - y0) ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] anns.append(ann) return anns def get_pose3d_res(results, im_id): anns = [] preds = results['pose3d'] for idx in range(im_id.shape[0]): image_id = im_id[idx].item() pose3d = preds[idx] ann = { 'image_id': image_id, 'category_id': 1, # XXX hard code 'pose3d': pose3d.tolist(), 'score': float(1.) } anns.append(ann) return anns ================================================ FILE: ppdet/metrics/keypoint_metrics.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import json from collections import defaultdict, OrderedDict import numpy as np import paddle from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe from scipy.io import loadmat, savemat from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval', 'KeyPointTopDownMPIIEval' ] class KeyPointTopDownCOCOEval(object): """refer to https://github.com/leoxiaobin/deep-high-resolution-net.pytorch Copyright (c) Microsoft, under the MIT License. 
""" def __init__(self, anno_file, num_samples, num_joints, output_eval, iou_type='keypoints', in_vis_thre=0.2, oks_thre=0.9, save_prediction_only=False): super(KeyPointTopDownCOCOEval, self).__init__() self.coco = COCO(anno_file) self.num_samples = num_samples self.num_joints = num_joints self.iou_type = iou_type self.in_vis_thre = in_vis_thre self.oks_thre = oks_thre self.output_eval = output_eval self.res_file = os.path.join(output_eval, "keypoints_results.json") self.save_prediction_only = save_prediction_only self.reset() def reset(self): self.results = { 'all_preds': np.zeros( (self.num_samples, self.num_joints, 3), dtype=np.float32), 'all_boxes': np.zeros((self.num_samples, 6)), 'image_path': [] } self.eval_results = {} self.idx = 0 def update(self, inputs, outputs): kpts, _ = outputs['keypoint'][0] num_images = inputs['image'].shape[0] self.results['all_preds'][self.idx:self.idx + num_images, :, 0: 3] = kpts[:, :, 0:3] self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[ 'center'].numpy()[:, 0:2] if isinstance( inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2] self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[ 'scale'].numpy()[:, 0:2] if isinstance( inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2] self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod( inputs['scale'].numpy() * 200, 1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod( inputs['scale'] * 200, 1) self.results['all_boxes'][ self.idx:self.idx + num_images, 5] = np.squeeze(inputs['score'].numpy()) if isinstance( inputs['score'], paddle.Tensor) else np.squeeze(inputs['score']) if isinstance(inputs['im_id'], paddle.Tensor): self.results['image_path'].extend(inputs['im_id'].numpy()) else: self.results['image_path'].extend(inputs['im_id']) self.idx += num_images def _write_coco_keypoint_results(self, keypoints): data_pack = [{ 'cat_id': 1, 'cls': 'person', 'ann_type': 'keypoints', 'keypoints': keypoints }] results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) if not os.path.exists(self.output_eval): os.makedirs(self.output_eval) with open(self.res_file, 'w') as f: json.dump(results, f, sort_keys=True, indent=4) logger.info(f'The keypoint result is saved to {self.res_file}.') try: json.load(open(self.res_file)) except Exception: content = [] with open(self.res_file, 'r') as f: for line in f: content.append(line) content[-1] = ']' with open(self.res_file, 'w') as f: for c in content: f.write(c) def _coco_keypoint_results_one_category_kernel(self, data_pack): cat_id = data_pack['cat_id'] keypoints = data_pack['keypoints'] cat_results = [] for img_kpts in keypoints: if len(img_kpts) == 0: continue _key_points = np.array( [img_kpts[k]['keypoints'] for k in range(len(img_kpts))]) _key_points = _key_points.reshape(_key_points.shape[0], -1) result = [{ 'image_id': img_kpts[k]['image'], 'category_id': cat_id, 'keypoints': _key_points[k].tolist(), 'score': img_kpts[k]['score'], 'center': list(img_kpts[k]['center']), 'scale': list(img_kpts[k]['scale']) } for k in range(len(img_kpts))] cat_results.extend(result) return cat_results def get_final_results(self, preds, all_boxes, img_path): _kpts = [] for idx, kpt in enumerate(preds): _kpts.append({ 'keypoints': kpt, 'center': all_boxes[idx][0:2], 'scale': all_boxes[idx][2:4], 'area': all_boxes[idx][4], 'score': all_boxes[idx][5], 'image': int(img_path[idx]) }) # image x person x (keypoints) kpts = defaultdict(list) for kpt in _kpts: kpts[kpt['image']].append(kpt) # rescoring and 
oks nms num_joints = preds.shape[1] in_vis_thre = self.in_vis_thre oks_thre = self.oks_thre oks_nmsed_kpts = [] for img in kpts.keys(): img_kpts = kpts[img] for n_p in img_kpts: box_score = n_p['score'] kpt_score = 0 valid_num = 0 for n_jt in range(0, num_joints): t_s = n_p['keypoints'][n_jt][2] if t_s > in_vis_thre: kpt_score = kpt_score + t_s valid_num = valid_num + 1 if valid_num != 0: kpt_score = kpt_score / valid_num # rescoring n_p['score'] = kpt_score * box_score keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))], oks_thre) if len(keep) == 0: oks_nmsed_kpts.append(img_kpts) else: oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep]) self._write_coco_keypoint_results(oks_nmsed_kpts) def accumulate(self): self.get_final_results(self.results['all_preds'], self.results['all_boxes'], self.results['image_path']) if self.save_prediction_only: logger.info(f'The keypoint result is saved to {self.res_file} ' 'and do not evaluate the mAP.') return coco_dt = self.coco.loadRes(self.res_file) coco_eval = COCOeval(self.coco, coco_dt, 'keypoints') coco_eval.params.useSegm = None coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() keypoint_stats = [] for ind in range(len(coco_eval.stats)): keypoint_stats.append((coco_eval.stats[ind])) self.eval_results['keypoint'] = keypoint_stats def log(self): if self.save_prediction_only: return stats_names = [ 'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', 'AR .75', 'AR (M)', 'AR (L)' ] num_values = len(stats_names) print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') print('|---' * (num_values + 1) + '|') print(' '.join([ '| {:.3f}'.format(value) for value in self.eval_results['keypoint'] ]) + ' |') def get_results(self): return self.eval_results class KeyPointTopDownCOCOWholeBadyHandEval(object): def __init__(self, anno_file, num_samples, num_joints, output_eval, save_prediction_only=False): super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__() self.coco = COCO(anno_file) self.num_samples = num_samples self.num_joints = num_joints self.output_eval = output_eval self.res_file = os.path.join(output_eval, "keypoints_results.json") self.save_prediction_only = save_prediction_only self.parse_dataset() self.reset() def parse_dataset(self): gt_db = [] num_joints = self.num_joints coco = self.coco img_ids = coco.getImgIds() for img_id in img_ids: ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) objs = coco.loadAnns(ann_ids) for obj in objs: for type in ['left', 'right']: if (obj[f'{type}hand_valid'] and max(obj[f'{type}hand_kpts']) > 0): joints = np.zeros((num_joints, 3), dtype=np.float32) joints_vis = np.zeros((num_joints, 3), dtype=np.float32) keypoints = np.array(obj[f'{type}hand_kpts']) keypoints = keypoints.reshape(-1, 3) joints[:, :2] = keypoints[:, :2] joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) gt_db.append({ 'bbox': obj[f'{type}hand_box'], 'gt_joints': joints, 'joints_vis': joints_vis, }) self.db = gt_db def reset(self): self.results = { 'preds': np.zeros( (self.num_samples, self.num_joints, 3), dtype=np.float32), } self.eval_results = {} self.idx = 0 def update(self, inputs, outputs): kpts, _ = outputs['keypoint'][0] num_images = inputs['image'].shape[0] self.results['preds'][self.idx:self.idx + num_images, :, 0: 3] = kpts[:, :, 0:3] self.idx += num_images def accumulate(self): self.get_final_results(self.results['preds']) if self.save_prediction_only: logger.info(f'The keypoint result is saved to {self.res_file} ' 'and do not evaluate the mAP.') return self.eval_results = 
self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE')) def get_final_results(self, preds): kpts = [] for idx, kpt in enumerate(preds): kpts.append({'keypoints': kpt.tolist()}) self._write_keypoint_results(kpts) def _write_keypoint_results(self, keypoints): if not os.path.exists(self.output_eval): os.makedirs(self.output_eval) with open(self.res_file, 'w') as f: json.dump(keypoints, f, sort_keys=True, indent=4) logger.info(f'The keypoint result is saved to {self.res_file}.') try: json.load(open(self.res_file)) except Exception: content = [] with open(self.res_file, 'r') as f: for line in f: content.append(line) content[-1] = ']' with open(self.res_file, 'w') as f: for c in content: f.write(c) def log(self): if self.save_prediction_only: return for item, value in self.eval_results.items(): print("{} : {}".format(item, value)) def get_results(self): return self.eval_results def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30): """Keypoint evaluation. Args: res_file (str): Json file stored prediction results. metrics (str | list[str]): Metric to be performed. Options: 'PCK', 'AUC', 'EPE'. pck_thr (float): PCK threshold, default as 0.2. auc_nor (float): AUC normalization factor, default as 30 pixel. Returns: List: Evaluation results for evaluation metric. """ info_str = [] with open(res_file, 'r') as fin: preds = json.load(fin) assert len(preds) == len(self.db) outputs = [] gts = [] masks = [] threshold_bbox = [] for pred, item in zip(preds, self.db): outputs.append(np.array(pred['keypoints'])[:, :-1]) gts.append(np.array(item['gt_joints'])[:, :-1]) masks.append((np.array(item['joints_vis'])[:, 0]) > 0) if 'PCK' in metrics: bbox = np.array(item['bbox']) bbox_thr = np.max(bbox[2:]) threshold_bbox.append(np.array([bbox_thr, bbox_thr])) outputs = np.array(outputs) gts = np.array(gts) masks = np.array(masks) threshold_bbox = np.array(threshold_bbox) if 'PCK' in metrics: _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, threshold_bbox) info_str.append(('PCK', pck)) if 'AUC' in metrics: info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor))) if 'EPE' in metrics: info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) name_value = OrderedDict(info_str) return name_value class KeyPointTopDownMPIIEval(object): def __init__(self, anno_file, num_samples, num_joints, output_eval, oks_thre=0.9, save_prediction_only=False): super(KeyPointTopDownMPIIEval, self).__init__() self.ann_file = anno_file self.res_file = os.path.join(output_eval, "keypoints_results.json") self.save_prediction_only = save_prediction_only self.reset() def reset(self): self.results = [] self.eval_results = {} self.idx = 0 def update(self, inputs, outputs): kpts, _ = outputs['keypoint'][0] num_images = inputs['image'].shape[0] results = {} results['preds'] = kpts[:, :, 0:3] results['boxes'] = np.zeros((num_images, 6)) results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2] results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2] results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1) results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy()) results['image_path'] = inputs['image_file'] self.results.append(results) def accumulate(self): self._mpii_keypoint_results_save() if self.save_prediction_only: logger.info(f'The keypoint result is saved to {self.res_file} ' 'and do not evaluate the mAP.') return self.eval_results = self.evaluate(self.results) def _mpii_keypoint_results_save(self): results = [] for res in self.results: if len(res) == 0: continue result = [{ 'preds': 
res['preds'][k].tolist(), 'boxes': res['boxes'][k].tolist(), 'image_path': res['image_path'][k], } for k in range(len(res))] results.extend(result) with open(self.res_file, 'w') as f: json.dump(results, f, sort_keys=True, indent=4) logger.info(f'The keypoint result is saved to {self.res_file}.') def log(self): if self.save_prediction_only: return for item, value in self.eval_results.items(): print("{} : {}".format(item, value)) def get_results(self): return self.eval_results def evaluate(self, outputs, savepath=None): """Evaluate PCKh for MPII dataset. refer to https://github.com/leoxiaobin/deep-high-resolution-net.pytorch Copyright (c) Microsoft, under the MIT License. Args: outputs(list(preds, boxes)): * preds (np.ndarray[N,K,3]): The first two dimensions are coordinates, score is the third dimension of the array. * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] , scale[1],area, score] Returns: dict: PCKh for each joint """ kpts = [] for output in outputs: preds = output['preds'] batch_size = preds.shape[0] for i in range(batch_size): kpts.append({'keypoints': preds[i]}) preds = np.stack([kpt['keypoints'] for kpt in kpts]) # convert 0-based index to 1-based index, # and get the first two dimensions. preds = preds[..., :2] + 1.0 if savepath is not None: pred_file = os.path.join(savepath, 'pred.mat') savemat(pred_file, mdict={'preds': preds}) SC_BIAS = 0.6 threshold = 0.5 gt_file = os.path.join( os.path.dirname(self.ann_file), 'mpii_gt_val.mat') gt_dict = loadmat(gt_file) dataset_joints = gt_dict['dataset_joints'] jnt_missing = gt_dict['jnt_missing'] pos_gt_src = gt_dict['pos_gt_src'] headboxes_src = gt_dict['headboxes_src'] pos_pred_src = np.transpose(preds, [1, 2, 0]) head = np.where(dataset_joints == 'head')[1][0] lsho = np.where(dataset_joints == 'lsho')[1][0] lelb = np.where(dataset_joints == 'lelb')[1][0] lwri = np.where(dataset_joints == 'lwri')[1][0] lhip = np.where(dataset_joints == 'lhip')[1][0] lkne = np.where(dataset_joints == 'lkne')[1][0] lank = np.where(dataset_joints == 'lank')[1][0] rsho = np.where(dataset_joints == 'rsho')[1][0] relb = np.where(dataset_joints == 'relb')[1][0] rwri = np.where(dataset_joints == 'rwri')[1][0] rkne = np.where(dataset_joints == 'rkne')[1][0] rank = np.where(dataset_joints == 'rank')[1][0] rhip = np.where(dataset_joints == 'rhip')[1][0] jnt_visible = 1 - jnt_missing uv_error = pos_pred_src - pos_gt_src uv_err = np.linalg.norm(uv_error, axis=1) headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] headsizes = np.linalg.norm(headsizes, axis=0) headsizes *= SC_BIAS scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) scaled_uv_err = uv_err / scale scaled_uv_err = scaled_uv_err * jnt_visible jnt_count = np.sum(jnt_visible, axis=1) less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count # save rng = np.arange(0, 0.5 + 0.01, 0.01) pckAll = np.zeros((len(rng), 16), dtype=np.float32) for r, threshold in enumerate(rng): less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible pckAll[r, :] = 100. 
* np.sum(less_than_threshold, axis=1) / jnt_count PCKh = np.ma.array(PCKh, mask=False) PCKh.mask[6:8] = True jnt_count = np.ma.array(jnt_count, mask=False) jnt_count.mask[6:8] = True jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) name_value = [ #noqa ('Head', PCKh[head]), ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), ('PCKh', np.sum(PCKh * jnt_ratio)), ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio)) ] name_value = OrderedDict(name_value) return name_value def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): """sort kpts and remove the repeated ones.""" kpts = sorted(kpts, key=lambda x: x[key]) num = len(kpts) for i in range(num - 1, 0, -1): if kpts[i][key] == kpts[i - 1][key]: del kpts[i] return kpts ================================================ FILE: ppdet/metrics/lvis_utils.py ================================================ # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import numpy as np import itertools from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res from ppdet.metrics.map_utils import draw_pr_curve from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) def lvisapi_eval(jsonfile, style, lvis_gt=None, anno_file=None, max_dets=(100, 300, 1000), classwise=False, sigmas=None, use_area=True): """ Args: jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`. coco_gt (str): Whether to load COCOAPI through anno_file, eg: coco_gt = COCO(anno_file) anno_file (str): COCO annotations file. max_dets (tuple): COCO evaluation maxDets. classwise (bool): Whether per-category AP and draw P-R Curve or not. sigmas (nparray): keypoint labelling sigmas. use_area (bool): If gt annotations (eg. CrowdPose, AIC) do not have 'area', please set use_area=False. """ assert lvis_gt != None or anno_file != None from lvis import LVIS, LVISEval, LVISResults if lvis_gt == None: # coco_gt = COCO(anno_file) lvis_gt = LVIS(anno_file) logger.info("Start evaluate...") lvis_dt = LVISResults(lvis_gt, jsonfile) lvis_eval = LVISEval(lvis_gt, lvis_dt, style) lvis_eval.evaluate() lvis_eval.accumulate() lvis_eval.summarize() if classwise: # Compute per-category AP and PR curve try: from terminaltables import AsciiTable except Exception as e: logger.error( 'terminaltables not found, plaese install terminaltables. 
' 'for example: `pip install terminaltables`.')
            raise e
        # NOTE: this block originally referenced the undefined names
        # coco_eval / coco_gt (left over from the COCO version of this
        # helper); rewritten here against the lvis-api objects. lvis-api's
        # precision array has shape (iou, recall, cls, area) with no
        # max_dets axis.
        precisions = lvis_eval.eval['precision']
        cat_ids = lvis_gt.get_cat_ids()
        # precision: (iou, recall, cls, area range)
        assert len(cat_ids) == precisions.shape[2]
        results_per_category = []
        for idx, catId in enumerate(cat_ids):
            # area range index 0: all area ranges
            nm = lvis_gt.load_cats([catId])[0]
            precision = precisions[:, :, idx, 0]
            precision = precision[precision > -1]
            if precision.size:
                ap = np.mean(precision)
            else:
                ap = float('nan')
            results_per_category.append(
                (str(nm["name"]), '{:0.3f}'.format(float(ap))))
            pr_array = precisions[0, :, idx, 0]
            recall_array = np.arange(0.0, 1.01, 0.01)
            draw_pr_curve(
                pr_array,
                recall_array,
                out_dir=style + '_pr_curve',
                file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))

        num_columns = min(6, len(results_per_category) * 2)
        results_flatten = list(itertools.chain(*results_per_category))
        headers = ['category', 'AP'] * (num_columns // 2)
        results_2d = itertools.zip_longest(
            *[results_flatten[i::num_columns] for i in range(num_columns)])
        table_data = [headers]
        table_data += [result for result in results_2d]
        table = AsciiTable(table_data)
        logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
        logger.info("per-category PR curve has output to {} folder.".format(
            style + '_pr_curve'))
    # flush evaluation result
    sys.stdout.flush()
    return lvis_eval.get_results()

================================================
FILE: ppdet/metrics/map_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import sys
import numpy as np
import itertools
import paddle
from ppdet.modeling.rbox_utils import poly2rbox_np

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = [
    'draw_pr_curve',
    'bbox_area',
    'jaccard_overlap',
    'prune_zero_padding',
    'DetectionMAP',
    'ap_per_class',
    'compute_ap',
]


def draw_pr_curve(precision,
                  recall,
                  iou=0.5,
                  out_dir='pr_curve',
                  file_name='precision_recall_curve.jpg'):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_path = os.path.join(out_dir, file_name)
    try:
        import matplotlib.pyplot as plt
    except Exception as e:
        logger.error('Matplotlib not found, please install matplotlib. '
                     'for example: `pip install matplotlib`.')
        raise e
    plt.cla()
    plt.figure('P-R Curve')
    plt.title('Precision/Recall Curve(IoU={})'.format(iou))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.grid(True)
    plt.plot(recall, precision)
    plt.savefig(output_path)


def bbox_area(bbox, is_bbox_normalized):
    """
    Calculate area of a bounding box
    """
    norm = 1.
- float(is_bbox_normalized) width = bbox[2] - bbox[0] + norm height = bbox[3] - bbox[1] + norm return width * height def jaccard_overlap(pred, gt, is_bbox_normalized=False): """ Calculate jaccard overlap ratio between two bounding box """ if pred[0] >= gt[2] or pred[2] <= gt[0] or \ pred[1] >= gt[3] or pred[3] <= gt[1]: return 0. inter_xmin = max(pred[0], gt[0]) inter_ymin = max(pred[1], gt[1]) inter_xmax = min(pred[2], gt[2]) inter_ymax = min(pred[3], gt[3]) inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], is_bbox_normalized) pred_size = bbox_area(pred, is_bbox_normalized) gt_size = bbox_area(gt, is_bbox_normalized) overlap = float(inter_size) / (pred_size + gt_size - inter_size) return overlap def calc_rbox_iou(pred, gt_poly): """ calc iou between rotated bbox """ # calc iou of bounding box for speedup pred = np.array(pred, np.float32).reshape(-1, 2) gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2) pred_rect = [ np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]), np.max(pred[:, 1]) ] gt_rect = [ np.min(gt_poly[:, 0]), np.min(gt_poly[:, 1]), np.max(gt_poly[:, 0]), np.max(gt_poly[:, 1]) ] iou = jaccard_overlap(pred_rect, gt_rect, False) if iou <= 0: return iou # calc rbox iou pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5) gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5) try: from ext_op import rbox_iou except Exception as e: print("import custom_ops error, try install ext_op " \ "following ppdet/ext_op/README.md", e) sys.stdout.flush() sys.exit(-1) pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32') pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32') iou = rbox_iou(pd_gt_rbox, pd_pred_rbox) iou = iou.numpy() return iou[0][0] def prune_zero_padding(gt_box, gt_label, difficult=None): valid_cnt = 0 for i in range(len(gt_box)): if (gt_box[i] == 0).all(): break valid_cnt += 1 return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] if difficult is not None else None) class DetectionMAP(object): """ Calculate detection mean average precision. Currently support two types: 11point and integral Args: class_num (int): The class number. overlap_thresh (float): The threshold of overlap ratio between prediction bounding box and ground truth bounding box for deciding true/false positive. Default 0.5. map_type (str): Calculation method of mean average precision, currently support '11point' and 'integral'. Default '11point'. is_bbox_normalized (bool): Whether bounding boxes is normalized to range[0, 1]. Default False. evaluate_difficult (bool): Whether to evaluate difficult bounding boxes. Default False. catid2name (dict): Mapping between category id and category name. classwise (bool): Whether per-category AP and draw P-R Curve or not. """ def __init__(self, class_num, overlap_thresh=0.5, map_type='11point', is_bbox_normalized=False, evaluate_difficult=False, catid2name=None, classwise=False): self.class_num = class_num self.overlap_thresh = overlap_thresh assert map_type in ['11point', 'integral'], \ "map_type currently only support '11point' "\ "and 'integral'" self.map_type = map_type self.is_bbox_normalized = is_bbox_normalized self.evaluate_difficult = evaluate_difficult self.classwise = classwise self.classes = [] for cname in catid2name.values(): self.classes.append(cname) self.reset() def update(self, bbox, score, label, gt_box, gt_label, difficult=None): """ Update metric statics from given prediction and ground truth infomations. 
""" if difficult is None: difficult = np.zeros_like(gt_label) # record class gt count for gtl, diff in zip(gt_label, difficult): if self.evaluate_difficult or int(diff) == 0: self.class_gt_counts[int(np.array(gtl))] += 1 # record class score positive visited = [False] * len(gt_label) for b, s, l in zip(bbox, score, label): pred = b.tolist() if isinstance(b, np.ndarray) else b max_idx = -1 max_overlap = -1.0 for i, gl in enumerate(gt_label): if int(gl) == int(l): if len(gt_box[i]) == 8: overlap = calc_rbox_iou(pred, gt_box[i]) else: overlap = jaccard_overlap(pred, gt_box[i], self.is_bbox_normalized) if overlap > max_overlap: max_overlap = overlap max_idx = i if max_overlap > self.overlap_thresh: if self.evaluate_difficult or \ int(np.array(difficult[max_idx])) == 0: if not visited[max_idx]: self.class_score_poss[int(l)].append([s, 1.0]) visited[max_idx] = True else: self.class_score_poss[int(l)].append([s, 0.0]) else: self.class_score_poss[int(l)].append([s, 0.0]) def reset(self): """ Reset metric statics """ self.class_score_poss = [[] for _ in range(self.class_num)] self.class_gt_counts = [0] * self.class_num self.mAP = 0.0 def accumulate(self): """ Accumulate metric results and calculate mAP """ mAP = 0. valid_cnt = 0 eval_results = [] for score_pos, count in zip(self.class_score_poss, self.class_gt_counts): if count == 0: continue if len(score_pos) == 0: valid_cnt += 1 continue accum_tp_list, accum_fp_list = \ self._get_tp_fp_accum(score_pos) precision = [] recall = [] for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): precision.append(float(ac_tp) / (ac_tp + ac_fp)) recall.append(float(ac_tp) / count) one_class_ap = 0.0 if self.map_type == '11point': max_precisions = [0.] * 11 start_idx = len(precision) - 1 for j in range(10, -1, -1): for i in range(start_idx, -1, -1): if recall[i] < float(j) / 10.: start_idx = i if j > 0: max_precisions[j - 1] = max_precisions[j] break else: if max_precisions[j] < precision[i]: max_precisions[j] = precision[i] one_class_ap = sum(max_precisions) / 11. mAP += one_class_ap valid_cnt += 1 elif self.map_type == 'integral': import math prev_recall = 0. for i in range(len(precision)): recall_gap = math.fabs(recall[i] - prev_recall) if recall_gap > 1e-6: one_class_ap += precision[i] * recall_gap prev_recall = recall[i] mAP += one_class_ap valid_cnt += 1 else: logger.error("Unspported mAP type {}".format(self.map_type)) sys.exit(1) eval_results.append({ 'class': self.classes[valid_cnt - 1], 'ap': one_class_ap, 'precision': precision, 'recall': recall, }) self.eval_results = eval_results self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP def get_map(self): """ Get mAP result """ if self.mAP is None: logger.error("mAP is not calculated.") if self.classwise: # Compute per-category AP and PR curve try: from terminaltables import AsciiTable except Exception as e: logger.error( 'terminaltables not found, plaese install terminaltables. 
'for example: `pip install terminaltables`.') raise e results_per_category = [] for eval_result in self.eval_results: results_per_category.append( (str(eval_result['class']), '{:0.3f}'.format(float(eval_result['ap'])))) draw_pr_curve( eval_result['precision'], eval_result['recall'], out_dir='voc_pr_curve', file_name='{}_precision_recall_curve.jpg'.format( eval_result['class'])) num_columns = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) headers = ['category', 'AP'] * (num_columns // 2) results_2d = itertools.zip_longest(* [ results_flatten[i::num_columns] for i in range(num_columns) ]) table_data = [headers] table_data += [result for result in results_2d] table = AsciiTable(table_data) logger.info('Per-category VOC AP:\n{}'.format(table.table)) logger.info( "Per-category PR curves have been saved to the voc_pr_curve folder.") return self.mAP def _get_tp_fp_accum(self, score_pos_list): """ Calculate accumulating true/false positive results from [score, pos] records """ sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) accum_tp = 0 accum_fp = 0 accum_tp_list = [] accum_fp_list = [] for (score, pos) in sorted_list: accum_tp += int(pos) accum_tp_list.append(accum_tp) accum_fp += 1 - int(pos) accum_fp_list.append(accum_fp) return accum_tp_list, accum_fp_list def ap_per_class(tp, conf, pred_cls, target_cls): """ Computes the average precision, given the recall and precision curves. Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. Args: tp (list): True positives. conf (list): Objectness value from 0-1. pred_cls (list): Predicted object classes. target_cls (list): Target object classes. """ tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array( pred_cls), np.array(target_cls) # Sort by objectness i = np.argsort(-conf) tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] # Find unique classes unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) # Create Precision-Recall curve and compute AP for each class ap, p, r = [], [], [] for c in unique_classes: i = pred_cls == c n_gt = sum(target_cls == c) # Number of ground truth objects n_p = sum(i) # Number of predicted objects if (n_p == 0) and (n_gt == 0): continue elif (n_p == 0) or (n_gt == 0): ap.append(0) r.append(0) p.append(0) else: # Accumulate FPs and TPs fpc = np.cumsum(1 - tp[i]) tpc = np.cumsum(tp[i]) # Recall recall_curve = tpc / (n_gt + 1e-16) r.append(tpc[-1] / (n_gt + 1e-16)) # Precision precision_curve = tpc / (tpc + fpc) p.append(tpc[-1] / (tpc[-1] + fpc[-1])) # AP from recall-precision curve ap.append(compute_ap(recall_curve, precision_curve)) return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array( p) def compute_ap(recall, precision): """ Computes the average precision, given the recall and precision curves. Code originally from https://github.com/rbgirshick/py-faster-rcnn. Args: recall (list): The recall curve. precision (list): The precision curve. Returns: The average precision as computed in py-faster-rcnn.
""" # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.], recall, [1.])) mpre = np.concatenate(([0.], precision, [0.])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap ================================================ FILE: ppdet/metrics/mcmot_metrics.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import copy import sys import math from collections import defaultdict import numpy as np import pandas as pd from .metrics import Metric try: import motmetrics as mm from motmetrics.math_util import quiet_divide metrics = mm.metrics.motchallenge_metrics mh = mm.metrics.create() except: print( 'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) pass from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['MCMOTEvaluator', 'MCMOTMetric'] METRICS_LIST = [ 'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses', 'num_detections', 'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked', 'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota', 'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1' ] NAME_MAP = { 'num_frames': 'num_frames', 'num_matches': 'num_matches', 'num_switches': 'IDs', 'num_transfer': 'IDt', 'num_ascend': 'IDa', 'num_migrate': 'IDm', 'num_false_positives': 'FP', 'num_misses': 'FN', 'num_detections': 'num_detections', 'num_objects': 'num_objects', 'num_predictions': 'num_predictions', 'num_unique_objects': 'GT', 'mostly_tracked': 'MT', 'partially_tracked': 'partially_tracked', 'mostly_lost': 'ML', 'num_fragmentations': 'FM', 'motp': 'MOTP', 'mota': 'MOTA', 'precision': 'Prcn', 'recall': 'Rcll', 'idfp': 'idfp', 'idfn': 'idfn', 'idtp': 'idtp', 'idp': 'IDP', 'idr': 'IDR', 'idf1': 'IDF1' } def parse_accs_metrics(seq_acc, index_name, verbose=False): """ Parse the evaluation indicators of multiple MOTAccumulator """ mh = mm.metrics.create() summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST) summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \ summary.loc['OVERALL', 'num_detections'] if verbose: strsummary = mm.io.render_summary( summary, formatters=mh.formatters, namemap=NAME_MAP) print(strsummary) return summary def seqs_overall_metrics(summary_df, verbose=False): """ Calculate overall metrics for multiple sequences """ add_col = [ 
'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses', 'num_detections', 'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked', 'partially_tracked', 'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp' ] calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1'] calc_df = summary_df.copy() overall_dic = {} for col in add_col: overall_dic[col] = calc_df[col].sum() for col in calc_col: overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')( calc_df, overall_dic) overall_df = pd.DataFrame(overall_dic, index=['overall_calc']) calc_df = pd.concat([calc_df, overall_df]) if verbose: mh = mm.metrics.create() str_calc_df = mm.io.render_summary( calc_df, formatters=mh.formatters, namemap=NAME_MAP) print(str_calc_df) return calc_df class MCMOTMetricOverall(object): def motp_overall(summary_df, overall_dic): motp = quiet_divide((summary_df['motp'] * summary_df['num_detections']).sum(), overall_dic['num_detections']) return motp def mota_overall(summary_df, overall_dic): del summary_df mota = 1. - quiet_divide( (overall_dic['num_misses'] + overall_dic['num_switches'] + overall_dic['num_false_positives']), overall_dic['num_objects']) return mota def precision_overall(summary_df, overall_dic): del summary_df precision = quiet_divide(overall_dic['num_detections'], ( overall_dic['num_false_positives'] + overall_dic['num_detections'])) return precision def recall_overall(summary_df, overall_dic): del summary_df recall = quiet_divide(overall_dic['num_detections'], overall_dic['num_objects']) return recall def idp_overall(summary_df, overall_dic): del summary_df idp = quiet_divide(overall_dic['idtp'], (overall_dic['idtp'] + overall_dic['idfp'])) return idp def idr_overall(summary_df, overall_dic): del summary_df idr = quiet_divide(overall_dic['idtp'], (overall_dic['idtp'] + overall_dic['idfn'])) return idr def idf1_overall(summary_df, overall_dic): del summary_df idf1 = quiet_divide(2. 
* overall_dic['idtp'], ( overall_dic['num_objects'] + overall_dic['num_predictions'])) return idf1 def read_mcmot_results_union(filename, is_gt, is_ignore): results_dict = dict() if os.path.isfile(filename): all_result = np.loadtxt(filename, delimiter=',') if all_result.shape[0] == 0 or all_result.shape[1] < 7: return results_dict if is_ignore: return results_dict if is_gt: # only for test use all_result = all_result[all_result[:, 7] != 0] all_result[:, 7] = all_result[:, 7] - 1 if all_result.shape[0] == 0: return results_dict class_unique = np.unique(all_result[:, 7]) last_max_id = 0 result_cls_list = [] for cls in class_unique: result_cls_split = all_result[all_result[:, 7] == cls] result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id # make sure track id different between every category last_max_id = max(np.unique(result_cls_split[:, 1])) + 1 result_cls_list.append(result_cls_split) results_con = np.concatenate(result_cls_list) for line in range(len(results_con)): linelist = results_con[line] fid = int(linelist[0]) if fid < 1: continue results_dict.setdefault(fid, list()) if is_gt: score = 1 else: score = float(linelist[6]) tlwh = tuple(map(float, linelist[2:6])) target_id = int(linelist[1]) cls = int(linelist[7]) results_dict[fid].append((tlwh, target_id, cls, score)) return results_dict def read_mcmot_results(filename, is_gt, is_ignore): results_dict = dict() if os.path.isfile(filename): with open(filename, 'r') as f: for line in f.readlines(): linelist = line.strip().split(',') if len(linelist) < 7: continue fid = int(linelist[0]) if fid < 1: continue cid = int(linelist[7]) if is_gt: score = 1 # only for test use cid -= 1 else: score = float(linelist[6]) cls_result_dict = results_dict.setdefault(cid, dict()) cls_result_dict.setdefault(fid, list()) tlwh = tuple(map(float, linelist[2:6])) target_id = int(linelist[1]) cls_result_dict[fid].append((tlwh, target_id, score)) return results_dict def read_results(filename, data_type, is_gt=False, is_ignore=False, multi_class=False, union=False): if data_type in ['mcmot', 'lab']: if multi_class: if union: # The results are evaluated by union all the categories. # Track IDs between different categories cannot be duplicate. read_fun = read_mcmot_results_union else: # The results are evaluated separately by category. read_fun = read_mcmot_results else: raise ValueError('multi_class: {}, MCMOT should have cls_id.'. 
format(multi_class)) else: raise ValueError('Unknown data type: {}'.format(data_type)) return read_fun(filename, is_gt, is_ignore) def unzip_objs(objs): if len(objs) > 0: tlwhs, ids, scores = zip(*objs) else: tlwhs, ids, scores = [], [], [] tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) return tlwhs, ids, scores def unzip_objs_cls(objs): if len(objs) > 0: tlwhs, ids, cls, scores = zip(*objs) else: tlwhs, ids, cls, scores = [], [], [], [] tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) ids = np.array(ids) cls = np.array(cls) scores = np.array(scores) return tlwhs, ids, cls, scores class MCMOTEvaluator(object): def __init__(self, data_root, seq_name, data_type, num_classes): self.data_root = data_root self.seq_name = seq_name self.data_type = data_type self.num_classes = num_classes self.load_annotations() try: import motmetrics as mm mm.lap.default_solver = 'lap' except Exception as e: raise RuntimeError( 'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) self.reset_accumulator() self.class_accs = [] def load_annotations(self): assert self.data_type == 'mcmot' self.gt_filename = os.path.join(self.data_root, '../', 'sequences', '{}.txt'.format(self.seq_name)) if not os.path.exists(self.gt_filename): logger.warning( "gt_filename '{}' of MCMOTEvaluator does not exist, so the MOTA will be -INF.".format(self.gt_filename)) def reset_accumulator(self): self.acc = mm.MOTAccumulator(auto_id=True) def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False): if union: trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3] gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3] # get distance matrix iou_distance = mm.distances.iou_matrix( gt_tlwhs, trk_tlwhs, max_iou=0.5) # Set the distance between objects of different categories to nan gt_cls_len = len(gt_cls) trk_cls_len = len(trk_cls) # When the number of GT or Trk is 0, iou_distance dimension is (0,0) if gt_cls_len != 0 and trk_cls_len != 0: gt_cls = gt_cls.reshape(gt_cls_len, 1) gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1) trk_cls = trk_cls.reshape(1, trk_cls_len) trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0) iou_distance = np.where(gt_cls == trk_cls, iou_distance, np.nan) else: trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] # get distance matrix iou_distance = mm.distances.iou_matrix( gt_tlwhs, trk_tlwhs, max_iou=0.5) self.acc.update(gt_ids, trk_ids, iou_distance) if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'mot_events'): events = self.acc.mot_events # only supported by https://github.com/longcw/py-motmetrics else: events = None return events def eval_file(self, result_filename): # evaluation of each category gt_frame_dict = read_results( self.gt_filename, self.data_type, is_gt=True, multi_class=True, union=False) result_frame_dict = read_results( result_filename, self.data_type, is_gt=False, multi_class=True, union=False) for cid in range(self.num_classes): self.reset_accumulator() cls_result_frame_dict = result_frame_dict.setdefault(cid, dict()) cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict()) # only labeled frames will be evaluated frames = sorted(list(set(cls_gt_frame_dict.keys()))) for frame_id in frames: trk_objs = cls_result_frame_dict.get(frame_id, []) gt_objs = cls_gt_frame_dict.get(frame_id, []) self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False) self.class_accs.append(self.acc) return self.class_accs @staticmethod def get_summary(accs, names, metrics=('mota',
'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): names = copy.deepcopy(names) if metrics is None: metrics = mm.metrics.motchallenge_metrics metrics = copy.deepcopy(metrics) mh = mm.metrics.create() summary = mh.compute_many( accs, metrics=metrics, names=names, generate_overall=True) return summary @staticmethod def save_summary(summary, filename): import pandas as pd writer = pd.ExcelWriter(filename) summary.to_excel(writer) writer.save() class MCMOTMetric(Metric): def __init__(self, num_classes, save_summary=False): self.num_classes = num_classes self.save_summary = save_summary self.MCMOTEvaluator = MCMOTEvaluator self.result_root = None self.reset() self.seqs_overall = defaultdict(list) def reset(self): self.accs = [] self.seqs = [] def update(self, data_root, seq, data_type, result_root, result_filename): evaluator = self.MCMOTEvaluator(data_root, seq, data_type, self.num_classes) seq_acc = evaluator.eval_file(result_filename) self.accs.append(seq_acc) self.seqs.append(seq) self.result_root = result_root cls_index_name = [ '{}_{}'.format(seq, i) for i in range(self.num_classes) ] summary = parse_accs_metrics(seq_acc, cls_index_name) summary.rename( index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True) for row in range(len(summary)): self.seqs_overall[row].append(summary.iloc[row:row + 1]) def accumulate(self): self.cls_summary_list = [] for row in range(self.num_classes): seqs_cls_df = pd.concat(self.seqs_overall[row]) seqs_cls_summary = seqs_overall_metrics(seqs_cls_df) cls_summary_overall = seqs_cls_summary.iloc[-1:].copy() cls_summary_overall.rename( index={'overall_calc': 'overall_calc_{}'.format(row)}, inplace=True) self.cls_summary_list.append(cls_summary_overall) def log(self): seqs_summary = seqs_overall_metrics( pd.concat(self.seqs_overall[self.num_classes]), verbose=True) class_summary = seqs_overall_metrics( pd.concat(self.cls_summary_list), verbose=True) def get_results(self): return 1 ================================================ FILE: ppdet/metrics/metrics.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
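# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the upstream file): every Metric
# subclass below follows the same reset -> update -> accumulate ->
# get_results cycle. A minimal, hypothetical driver for the DetectionMAP
# helper this module imports from map_utils would look like:
#
#     import numpy as np
#     from ppdet.metrics.map_utils import DetectionMAP
#
#     dmap = DetectionMAP(class_num=1, overlap_thresh=0.5,
#                         map_type='11point', catid2name={0: 'person'})
#     dmap.update(bbox=np.array([[12., 11., 49., 52.]]),
#                 score=np.array([0.9]), label=np.array([0]),
#                 gt_box=np.array([[10., 10., 50., 50.]]),
#                 gt_label=np.array([[0]]))
#     dmap.accumulate()
#     dmap.get_map()  # -> 1.0, the single prediction matches its GT box
# ---------------------------------------------------------------------------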
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import json import paddle import numpy as np import typing from collections import defaultdict from pathlib import Path from .map_utils import prune_zero_padding, DetectionMAP from .coco_utils import get_infer_results, cocoapi_eval from .lvis_utils import lvisapi_eval from .widerface_utils import (face_eval_run, image_eval, img_pr_info, dataset_pr_info, voc_ap) from ppdet.data.source.category import get_categories from ppdet.modeling.rbox_utils import poly2rbox_np from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results', 'RBoxMetric', 'SNIPERCOCOMetric', 'LVISMetric' ] COCO_SIGMAS = np.array([ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ]) / 10.0 CROWD_SIGMAS = np.array( [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, .79]) / 10.0 class Metric(paddle.metric.Metric): def name(self): return self.__class__.__name__ def reset(self): pass def accumulate(self): pass # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate` # :metch:`reset`, in ppdet, we also need following 2 methods: # abstract method for logging metric results def log(self): pass # abstract method for getting metric results def get_results(self): pass class COCOMetric(Metric): def __init__(self, anno_file, **kwargs): self.anno_file = anno_file self.clsid2catid = kwargs.get('clsid2catid', None) if self.clsid2catid is None: self.clsid2catid, _ = get_categories('COCO', anno_file) self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) # TODO: bias should be unified self.bias = kwargs.get('bias', 0) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.iou_type = kwargs.get('IouType', 'bbox') if not self.save_prediction_only: assert os.path.isfile(anno_file), \ "anno_file {} not a file".format(anno_file) if self.output_eval is not None: Path(self.output_eval).mkdir(exist_ok=True) self.save_threshold = kwargs.get('save_threshold', 0) self.reset() def reset(self): # only bbox and mask evaluation support currently self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} self.eval_results = {} def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v # multi-scale inputs: all inputs have same im_id if isinstance(inputs, typing.Sequence): im_id = inputs[0]['im_id'] else: im_id = inputs['im_id'] outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id if 'im_file' in inputs: outs['im_file'] = inputs['im_file'] infer_results = get_infer_results( outs, self.clsid2catid, bias=self.bias, save_threshold=self.save_threshold) self.results['bbox'] += infer_results[ 'bbox'] if 'bbox' in infer_results else [] self.results['mask'] += infer_results[ 'mask'] if 'mask' in infer_results else [] self.results['segm'] += infer_results[ 'segm'] if 'segm' in infer_results else [] self.results['keypoint'] += infer_results[ 'keypoint'] if 'keypoint' in infer_results else [] def accumulate(self): if len(self.results['bbox']) > 0: output = "bbox.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['bbox'], f) logger.info('The bbox result is saved to bbox.json.') if 
self.save_prediction_only: logger.info('The bbox result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: bbox_stats = cocoapi_eval( output, 'bbox', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['bbox'] = bbox_stats sys.stdout.flush() if len(self.results['mask']) > 0: output = "mask.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['mask'], f) logger.info('The mask result is saved to mask.json.') if self.save_prediction_only: logger.info('The mask result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['segm']) > 0: output = "segm.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['segm'], f) logger.info('The segm result is saved to segm.json.') if self.save_prediction_only: logger.info('The segm result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['keypoint']) > 0: output = "keypoint.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['keypoint'], f) logger.info('The keypoint result is saved to keypoint.json.') if self.save_prediction_only: logger.info('The keypoint result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: style = 'keypoints' use_area = True sigmas = COCO_SIGMAS if self.iou_type == 'keypoints_crowd': style = 'keypoints_crowd' use_area = False sigmas = CROWD_SIGMAS keypoint_stats = cocoapi_eval( output, style, anno_file=self.anno_file, classwise=self.classwise, sigmas=sigmas, use_area=use_area) self.eval_results['keypoint'] = keypoint_stats sys.stdout.flush() def log(self): pass def get_results(self): return self.eval_results class LVISMetric(Metric): def __init__(self, anno_file, **kwargs): self.anno_file = anno_file self.clsid2catid = kwargs.get('clsid2catid', None) if self.clsid2catid is None: self.clsid2catid, _ = get_categories('COCO', anno_file) self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) # TODO: bias should be unified self.bias = kwargs.get('bias', 0) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.iou_type = kwargs.get('IouType', 'bbox') if not self.save_prediction_only: assert os.path.isfile(anno_file), \ "anno_file {} not a file".format(anno_file) if self.output_eval is not None: Path(self.output_eval).mkdir(exist_ok=True) self.reset() def reset(self): # only bbox and mask evaluation support currently self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} self.eval_results = {} def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v # multi-scale inputs: all inputs have same im_id if isinstance(inputs, typing.Sequence): im_id = inputs[0]['im_id'] else: im_id = inputs['im_id'] outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id infer_results = get_infer_results( outs, self.clsid2catid, bias=self.bias) self.results['bbox'] += infer_results[ 'bbox'] if 'bbox' in 
infer_results else [] self.results['mask'] += infer_results[ 'mask'] if 'mask' in infer_results else [] self.results['segm'] += infer_results[ 'segm'] if 'segm' in infer_results else [] self.results['keypoint'] += infer_results[ 'keypoint'] if 'keypoint' in infer_results else [] def accumulate(self): if len(self.results['bbox']) > 0: output = "bbox.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['bbox'], f) logger.info('The bbox result is saved to bbox.json.') if self.save_prediction_only: logger.info('The bbox result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: bbox_stats = lvisapi_eval( output, 'bbox', anno_file=self.anno_file, classwise=self.classwise ) self.eval_results['bbox'] = bbox_stats sys.stdout.flush() if len(self.results['mask']) > 0: output = "mask.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['mask'], f) logger.info('The mask result is saved to mask.json.') if self.save_prediction_only: logger.info('The mask result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['segm']) > 0: output = "segm.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['segm'], f) logger.info('The segm result is saved to segm.json.') if self.save_prediction_only: logger.info('The segm result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['keypoint']) > 0: output = "keypoint.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['keypoint'], f) logger.info('The keypoint result is saved to keypoint.json.') if self.save_prediction_only: logger.info('The keypoint result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: style = 'keypoints' use_area = True sigmas = COCO_SIGMAS if self.iou_type == 'keypoints_crowd': style = 'keypoints_crowd' use_area = False sigmas = CROWD_SIGMAS keypoint_stats = cocoapi_eval( output, style, anno_file=self.anno_file, classwise=self.classwise, sigmas=sigmas, use_area=use_area) self.eval_results['keypoint'] = keypoint_stats sys.stdout.flush() def log(self): # pass logger.info(self.eval_results['bbox']) def get_results(self): return self.eval_results class VOCMetric(Metric): def __init__(self, label_list, class_num=20, overlap_thresh=0.5, map_type='11point', is_bbox_normalized=False, evaluate_difficult=False, classwise=False, output_eval=None, save_prediction_only=False): assert os.path.isfile(label_list), \ "label_list {} not a file".format(label_list) self.clsid2catid, self.catid2name = get_categories('VOC', label_list) self.overlap_thresh = overlap_thresh self.map_type = map_type self.evaluate_difficult = evaluate_difficult self.output_eval = output_eval self.save_prediction_only = save_prediction_only self.detection_map = DetectionMAP( class_num=class_num, overlap_thresh=overlap_thresh, map_type=map_type, is_bbox_normalized=is_bbox_normalized, evaluate_difficult=evaluate_difficult, catid2name=self.catid2name, classwise=classwise) self.reset() def 
reset(self): self.results = {'bbox': [], 'score': [], 'label': []} self.detection_map.reset() def update(self, inputs, outputs): bbox_np = outputs['bbox'].numpy() if isinstance( outputs['bbox'], paddle.Tensor) else outputs['bbox'] bboxes = bbox_np[:, 2:] scores = bbox_np[:, 1] labels = bbox_np[:, 0] bbox_lengths = outputs['bbox_num'].numpy() if isinstance( outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num'] self.results['bbox'].append(bboxes.tolist()) self.results['score'].append(scores.tolist()) self.results['label'].append(labels.tolist()) if bboxes.shape == (1, 1) or bboxes is None: return if self.save_prediction_only: return gt_boxes = inputs['gt_bbox'] gt_labels = inputs['gt_class'] difficults = inputs['difficult'] if not self.evaluate_difficult \ else None if 'scale_factor' in inputs: scale_factor = inputs['scale_factor'].numpy() if isinstance( inputs['scale_factor'], paddle.Tensor) else inputs['scale_factor'] else: scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') bbox_idx = 0 for i in range(len(gt_boxes)): gt_box = gt_boxes[i].numpy() if isinstance( gt_boxes[i], paddle.Tensor) else gt_boxes[i] h, w = scale_factor[i] gt_box = gt_box / np.array([w, h, w, h]) gt_label = gt_labels[i].numpy() if isinstance( gt_labels[i], paddle.Tensor) else gt_labels[i] if difficults is not None: difficult = difficults[i].numpy() if isinstance( difficults[i], paddle.Tensor) else difficults[i] else: difficult = None bbox_num = bbox_lengths[i] bbox = bboxes[bbox_idx:bbox_idx + bbox_num] score = scores[bbox_idx:bbox_idx + bbox_num] label = labels[bbox_idx:bbox_idx + bbox_num] gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, difficult) self.detection_map.update(bbox, score, label, gt_box, gt_label, difficult) bbox_idx += bbox_num def accumulate(self): output = "bbox.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results, f) logger.info('The bbox result is saved to bbox.json.') if self.save_prediction_only: return logger.info("Accumulating evaluatation results...") self.detection_map.accumulate() def log(self): map_stat = 100. 
* self.detection_map.get_map() logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, self.map_type, map_stat)) def get_results(self): return {'bbox': [self.detection_map.get_map()]} class WiderFaceMetric(Metric): def __init__(self, iou_thresh=0.5): self.iou_thresh = iou_thresh self.reset() def reset(self): self.pred_boxes_list = [] self.gt_boxes_list = [] self.aps = [] self.hard_ignore_list = [] self.medium_ignore_list = [] self.easy_ignore_list = [] def update(self, data, outs): batch_pred_bboxes = outs['bbox'] batch_pred_bboxes_num = outs['bbox_num'] assert len(batch_pred_bboxes_num) == len(data['gt_bbox']) batch_size = len(data['gt_bbox']) box_cnt = 0 for batch_id in range(batch_size): pred_bboxes_num = batch_pred_bboxes_num[batch_id] pred_bboxes = batch_pred_bboxes[box_cnt: box_cnt + pred_bboxes_num].numpy() box_cnt += pred_bboxes_num det_conf = pred_bboxes[:, 1] det_xmin = pred_bboxes[:, 2] det_ymin = pred_bboxes[:, 3] det_xmax = pred_bboxes[:, 4] det_ymax = pred_bboxes[:, 5] det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) self.pred_boxes_list.append(det) # xyxy conf self.gt_boxes_list.append(data['gt_ori_bbox'][batch_id].numpy()) # xywh self.hard_ignore_list.append( data['gt_hard_ignore'][batch_id].numpy()) self.medium_ignore_list.append( data['gt_medium_ignore'][batch_id].numpy()) self.easy_ignore_list.append( data['gt_easy_ignore'][batch_id].numpy()) def accumulate(self): total_num = len(self.gt_boxes_list) settings = ['easy', 'medium', 'hard'] setting_ingores = [self.easy_ignore_list, self.medium_ignore_list, self.hard_ignore_list] thresh_num = 1000 aps = [] for setting_id in range(3): count_face = 0 pr_curve = np.zeros((thresh_num, 2)).astype(np.float32) gt_ignore_list = setting_ingores[setting_id] for i in range(total_num): pred_boxes = self.pred_boxes_list[i] # xyxy conf gt_boxes = self.gt_boxes_list[i] # xywh ignore = gt_ignore_list[i] count_face += np.sum(ignore) if len(gt_boxes) == 0 or len(pred_boxes) == 0: continue pred_recall, proposal_list = image_eval(pred_boxes, gt_boxes, ignore, self.iou_thresh) _img_pr_info = img_pr_info(thresh_num, pred_boxes, proposal_list, pred_recall) pr_curve += _img_pr_info pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) propose = pr_curve[:, 0] recall = pr_curve[:, 1] ap = voc_ap(recall, propose) aps.append(ap) self.aps = aps def log(self): logger.info("==================== Results ====================") logger.info("Easy Val AP: {}".format(self.aps[0])) logger.info("Medium Val AP: {}".format(self.aps[1])) logger.info("Hard Val AP: {}".format(self.aps[2])) logger.info("=================================================") def get_results(self): return { 'easy_ap': self.aps[0], 'medium_ap': self.aps[1], 'hard_ap': self.aps[2]} class RBoxMetric(Metric): def __init__(self, anno_file, **kwargs): self.anno_file = anno_file self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file) self.catid2clsid = {v: k for k, v in self.clsid2catid.items()} self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.overlap_thresh = kwargs.get('overlap_thresh', 0.5) self.map_type = kwargs.get('map_type', '11point') self.evaluate_difficult = kwargs.get('evaluate_difficult', False) self.imid2path = kwargs.get('imid2path', None) class_num = len(self.catid2name) self.detection_map = DetectionMAP( class_num=class_num, overlap_thresh=self.overlap_thresh, map_type=self.map_type, 
is_bbox_normalized=False, evaluate_difficult=self.evaluate_difficult, catid2name=self.catid2name, classwise=self.classwise) self.reset() def reset(self): self.results = [] self.detection_map.reset() def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v im_id = inputs['im_id'] im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id outs['im_id'] = im_id infer_results = get_infer_results(outs, self.clsid2catid) infer_results = infer_results['bbox'] if 'bbox' in infer_results else [] self.results += infer_results if self.save_prediction_only: return gt_boxes = inputs['gt_poly'] gt_labels = inputs['gt_class'] if 'scale_factor' in inputs: scale_factor = inputs['scale_factor'].numpy() if isinstance( inputs['scale_factor'], paddle.Tensor) else inputs['scale_factor'] else: scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') for i in range(len(gt_boxes)): gt_box = gt_boxes[i].numpy() if isinstance( gt_boxes[i], paddle.Tensor) else gt_boxes[i] h, w = scale_factor[i] gt_box = gt_box / np.array([w, h, w, h, w, h, w, h]) gt_label = gt_labels[i].numpy() if isinstance( gt_labels[i], paddle.Tensor) else gt_labels[i] gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label) bbox = [ res['bbox'] for res in infer_results if int(res['image_id']) == int(im_id[i]) ] score = [ res['score'] for res in infer_results if int(res['image_id']) == int(im_id[i]) ] label = [ self.catid2clsid[int(res['category_id'])] for res in infer_results if int(res['image_id']) == int(im_id[i]) ] self.detection_map.update(bbox, score, label, gt_box, gt_label) def save_results(self, results, output_dir, imid2path): if imid2path: data_dicts = defaultdict(list) for result in results: image_id = result['image_id'] data_dicts[image_id].append(result) for image_id, image_path in imid2path.items(): basename = os.path.splitext(os.path.split(image_path)[-1])[0] output = os.path.join(output_dir, "{}.txt".format(basename)) dets = data_dicts.get(image_id, []) with open(output, 'w') as f: for det in dets: catid, bbox, score = det['category_id'], det[ 'bbox'], det['score'] bbox_pred = '{} {} '.format(self.catid2name[catid], score) + ' '.join( [str(e) for e in bbox]) f.write(bbox_pred + '\n') logger.info('The bbox result is saved to {}.'.format(output_dir)) else: output = os.path.join(output_dir, "bbox.json") with open(output, 'w') as f: json.dump(results, f) logger.info('The bbox result is saved to {}.'.format(output)) def accumulate(self): if self.output_eval: self.save_results(self.results, self.output_eval, self.imid2path) if not self.save_prediction_only: logger.info("Accumulating evaluatation results...") self.detection_map.accumulate() def log(self): map_stat = 100. 
* self.detection_map.get_map() logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, self.map_type, map_stat)) def get_results(self): return {'bbox': [self.detection_map.get_map()]} class SNIPERCOCOMetric(COCOMetric): def __init__(self, anno_file, **kwargs): super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs) self.dataset = kwargs["dataset"] self.chip_results = [] def reset(self): # only bbox and mask evaluation support currently self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} self.eval_results = {} self.chip_results = [] def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v im_id = inputs['im_id'] outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id self.chip_results.append(outs) def accumulate(self): results = self.dataset.anno_cropper.aggregate_chips_detections( self.chip_results) for outs in results: infer_results = get_infer_results( outs, self.clsid2catid, bias=self.bias) self.results['bbox'] += infer_results[ 'bbox'] if 'bbox' in infer_results else [] super(SNIPERCOCOMetric, self).accumulate() ================================================ FILE: ppdet/metrics/mot_metrics.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
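# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the upstream file): JDEDetMetric
# below scores detections with ap_per_class from map_utils. A toy call with
# made-up numbers -- one correct and one spurious detection of a single
# class-0 ground-truth object:
#
#     from ppdet.metrics.map_utils import ap_per_class
#
#     ap, classes, r, p = ap_per_class(tp=[1, 0], conf=[0.9, 0.8],
#                                      pred_cls=[0, 0], target_cls=[0])
#     # ap[0] == 1.0: the highest-scored detection is the true positive,
#     # so the precision envelope at recall 1.0 is still 1.0.
# ---------------------------------------------------------------------------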
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import copy import sys import math from collections import defaultdict import numpy as np from ppdet.modeling.bbox_utils import bbox_iou_np_expand from .map_utils import ap_per_class from .metrics import Metric from .munkres import Munkres try: import motmetrics as mm mm.lap.default_solver = 'lap' except: print( 'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) pass from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric'] def read_mot_results(filename, is_gt=False, is_ignore=False): valid_label = [1] ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16' if is_gt: logger.info( "In the MOT16/17 dataset the valid_label of ground truth is '{}', " "in other datasets it should be '0' for single-class MOT.".format( valid_label[0])) results_dict = dict() if os.path.isfile(filename): with open(filename, 'r') as f: for line in f.readlines(): linelist = line.split(',') if len(linelist) < 7: continue fid = int(linelist[0]) if fid < 1: continue results_dict.setdefault(fid, list()) if is_gt: label = int(float(linelist[7])) mark = int(float(linelist[6])) if mark == 0 or label not in valid_label: continue score = 1 elif is_ignore: if 'MOT16-' in filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename: label = int(float(linelist[7])) vis_ratio = float(linelist[8]) if label not in ignore_labels and vis_ratio >= 0: continue else: continue score = 1 else: score = float(linelist[6]) tlwh = tuple(map(float, linelist[2:6])) target_id = int(linelist[1]) results_dict[fid].append((tlwh, target_id, score)) return results_dict """ MOT dataset label list, see in https://motchallenge.net labels={'ped', ... % 1 'person_on_vhcl', ... % 2 'car', ... % 3 'bicycle', ... % 4 'mbike', ... % 5 'non_mot_vhcl', ... % 6 'static_person', ... % 7 'distractor', ... % 8 'occluder', ... % 9 'occluder_on_grnd', ... % 10 'occluder_full', ... % 11 'reflection', ... % 12 'crowd' ... % 13 }; """ def unzip_objs(objs): if len(objs) > 0: tlwhs, ids, scores = zip(*objs) else: tlwhs, ids, scores = [], [], [] tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) return tlwhs, ids, scores class MOTEvaluator(object): def __init__(self, data_root, seq_name, data_type): self.data_root = data_root self.seq_name = seq_name self.data_type = data_type self.load_annotations() try: import motmetrics as mm mm.lap.default_solver = 'lap' except Exception as e: raise RuntimeError( 'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) self.reset_accumulator() def load_annotations(self): assert self.data_type == 'mot' gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') if not os.path.exists(gt_filename): logger.warning( "gt_filename '{}' of MOTEvaluator does not exist, so the MOTA will be -INF.".format(gt_filename
) self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True) self.gt_ignore_frame_dict = read_mot_results( gt_filename, is_ignore=True) def reset_accumulator(self): self.acc = mm.MOTAccumulator(auto_id=True) def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): # results trk_tlwhs = np.copy(trk_tlwhs) trk_ids = np.copy(trk_ids) # gts gt_objs = self.gt_frame_dict.get(frame_id, []) gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] # ignore boxes ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) ignore_tlwhs = unzip_objs(ignore_objs)[0] # remove ignored results keep = np.ones(len(trk_tlwhs), dtype=bool) iou_distance = mm.distances.iou_matrix( ignore_tlwhs, trk_tlwhs, max_iou=0.5) if len(iou_distance) > 0: match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) match_ious = iou_distance[match_is, match_js] match_js = np.asarray(match_js, dtype=int) match_js = match_js[np.logical_not(np.isnan(match_ious))] keep[match_js] = False trk_tlwhs = trk_tlwhs[keep] trk_ids = trk_ids[keep] # get distance matrix iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) # acc self.acc.update(gt_ids, trk_ids, iou_distance) if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics else: events = None return events def eval_file(self, filename): self.reset_accumulator() result_frame_dict = read_mot_results(filename, is_gt=False) frames = sorted(list(set(result_frame_dict.keys()))) for frame_id in frames: trk_objs = result_frame_dict.get(frame_id, []) trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) return self.acc @staticmethod def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): names = copy.deepcopy(names) if metrics is None: metrics = mm.metrics.motchallenge_metrics metrics = copy.deepcopy(metrics) mh = mm.metrics.create() summary = mh.compute_many( accs, metrics=metrics, names=names, generate_overall=True) return summary @staticmethod def save_summary(summary, filename): import pandas as pd writer = pd.ExcelWriter(filename) summary.to_excel(writer) writer.save() class MOTMetric(Metric): def __init__(self, save_summary=False): self.save_summary = save_summary self.MOTEvaluator = MOTEvaluator self.result_root = None self.reset() def reset(self): self.accs = [] self.seqs = [] def update(self, data_root, seq, data_type, result_root, result_filename): evaluator = self.MOTEvaluator(data_root, seq, data_type) self.accs.append(evaluator.eval_file(result_filename)) self.seqs.append(seq) self.result_root = result_root def accumulate(self): metrics = mm.metrics.motchallenge_metrics mh = mm.metrics.create() summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics) self.strsummary = mm.io.render_summary( summary, formatters=mh.formatters, namemap=mm.io.motchallenge_metric_names) if self.save_summary: self.MOTEvaluator.save_summary( summary, os.path.join(self.result_root, 'summary.xlsx')) def log(self): print(self.strsummary) def get_results(self): return self.strsummary class JDEDetMetric(Metric): # Note this detection AP metric is different from COCOMetric or VOCMetric, # and the bboxes coordinates are not scaled to the original image def __init__(self, overlap_thresh=0.5): self.overlap_thresh = overlap_thresh self.reset() def reset(self): 
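        # NOTE (added comment): running sums across update() calls. AP_accum holds the summed per-class AP values and AP_accum_count the number of contributions, so accumulate() can take their ratio; JDE detection is single-class here, hence arrays of size 1.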
self.AP_accum = np.zeros(1) self.AP_accum_count = np.zeros(1) def update(self, inputs, outputs): bboxes = outputs['bbox'][:, 2:].numpy() scores = outputs['bbox'][:, 1].numpy() labels = outputs['bbox'][:, 0].numpy() bbox_lengths = outputs['bbox_num'].numpy() if bboxes.shape[0] == 1 and bboxes.sum() == 0.0: return gt_boxes = inputs['gt_bbox'].numpy()[0] gt_labels = inputs['gt_class'].numpy()[0] if gt_labels.shape[0] == 0: return correct = [] detected = [] for i in range(bboxes.shape[0]): obj_pred = 0 pred_bbox = bboxes[i].reshape(1, 4) # Compute iou with target boxes iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0] # Extract index of largest overlap best_i = np.argmax(iou) # If overlap exceeds threshold and classification is correct mark as correct if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[ best_i] and best_i not in detected: correct.append(1) detected.append(best_i) else: correct.append(0) # Compute Average Precision (AP) per class target_cls = list(gt_labels.T[0]) AP, AP_class, R, P = ap_per_class( tp=correct, conf=scores, pred_cls=np.zeros_like(scores), target_cls=target_cls) self.AP_accum_count += np.bincount(AP_class, minlength=1) self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP) def accumulate(self): logger.info("Accumulating evaluatation results...") self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16) def log(self): map_stat = 100. * self.map_stat logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh, map_stat)) def get_results(self): return self.map_stat """ Following code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py """ class tData: """ Utility class to load data. """ def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\ obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\ X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1): """ Constructor, initializes the object given the parameters. 
""" self.frame = frame self.track_id = track_id self.obj_type = obj_type self.truncation = truncation self.occlusion = occlusion self.obs_angle = obs_angle self.x1 = x1 self.y1 = y1 self.x2 = x2 self.y2 = y2 self.w = w self.h = h self.l = l self.X = X self.Y = Y self.Z = Z self.yaw = yaw self.score = score self.ignored = False self.valid = False self.tracker = -1 def __str__(self): attrs = vars(self) return '\n'.join("%s: %s" % item for item in attrs.items()) class KITTIEvaluation(object): """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall) MOTA - Multi-object tracking accuracy in [0,100] MOTP - Multi-object tracking precision in [0,100] (3D) / [td,100] (2D) MOTAL - Multi-object tracking accuracy in [0,100] with log10(id-switches) id-switches - number of id switches fragments - number of fragmentations MT, PT, ML - number of mostly tracked, partially tracked and mostly lost trajectories recall - recall = percentage of detected targets precision - precision = percentage of correctly detected targets FAR - number of false alarms per frame falsepositives - number of false positives (FP) missed - number of missed targets (FN) """ def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\ min_height = 25, max_occlusion = 2, cls="car",\ n_frames=[], seqs=[], n_sequences=0): # get number of sequences and # get number of frames per sequence from test mapping # (created while extracting the benchmark) self.gt_path = os.path.join(gt_path, "../labels") self.n_frames = n_frames self.sequence_name = seqs self.n_sequences = n_sequences self.cls = cls # class to evaluate, i.e. pedestrian or car self.result_path = result_path # statistics and numbers for evaluation self.n_gt = 0 # number of ground truth detections minus ignored false negatives and true positives self.n_igt = 0 # number of ignored ground truth detections self.n_gts = [ ] # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE self.n_igts = [ ] # number of ground ignored truth detections PER SEQUENCE self.n_gt_trajectories = 0 self.n_gt_seq = [] self.n_tr = 0 # number of tracker detections minus ignored tracker detections self.n_trs = [ ] # number of tracker detections minus ignored tracker detections PER SEQUENCE self.n_itr = 0 # number of ignored tracker detections self.n_itrs = [] # number of ignored tracker detections PER SEQUENCE self.n_igttr = 0 # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored self.n_tr_trajectories = 0 self.n_tr_seq = [] self.MOTA = 0 self.MOTP = 0 self.MOTAL = 0 self.MODA = 0 self.MODP = 0 self.MODP_t = [] self.recall = 0 self.precision = 0 self.F1 = 0 self.FAR = 0 self.total_cost = 0 self.itp = 0 # number of ignored true positives self.itps = [] # number of ignored true positives PER SEQUENCE self.tp = 0 # number of true positives including ignored true positives! 
self.tps = [ ] # number of true positives including ignored true positives PER SEQUENCE self.fn = 0 # number of false negatives WITHOUT ignored false negatives self.fns = [ ] # number of false negatives WITHOUT ignored false negatives PER SEQUENCE self.ifn = 0 # number of ignored false negatives self.ifns = [] # number of ignored false negatives PER SEQUENCE self.fp = 0 # number of false positives # a bit tricky, the number of ignored false negatives and ignored true positives # is subtracted, but if both tracker detection and ground truth detection # are ignored this number is added again to avoid double counting self.fps = [] # above PER SEQUENCE self.mme = 0 self.fragments = 0 self.id_switches = 0 self.MT = 0 self.PT = 0 self.ML = 0 self.min_overlap = min_overlap # minimum bounding box overlap for 3rd party metrics self.max_truncation = max_truncation # maximum truncation of an object for evaluation self.max_occlusion = max_occlusion # maximum occlusion of an object for evaluation self.min_height = min_height # minimum height of an object for evaluation self.n_sample_points = 500 # this should be enough to hold all groundtruth trajectories # is expanded if necessary and reduced in any case self.gt_trajectories = [[] for x in range(self.n_sequences)] self.ign_trajectories = [[] for x in range(self.n_sequences)] def loadGroundtruth(self): try: self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True) except IOError: return False return True def loadTracker(self): try: if not self._loadData( self.result_path, cls=self.cls, loading_groundtruth=False): return False except IOError: return False return True def _loadData(self, root_dir, cls, min_score=-1000, loading_groundtruth=False): """ Generic loader for ground truth and tracking data. Use loadGroundtruth() or loadTracker() to load this data. Loads detections in KITTI format from textfiles. """ # construct objectDetections object to hold detection data t_data = tData() data = [] eval_2d = True eval_3d = True seq_data = [] n_trajectories = 0 n_trajectories_seq = [] for seq, s_name in enumerate(self.sequence_name): i = 0 filename = os.path.join(root_dir, "%s.txt" % s_name) f = open(filename, "r") f_data = [ [] for x in range(self.n_frames[seq]) ] # current set has only 1059 entries, sufficient length is checked anyway ids = [] n_in_seq = 0 id_frame_cache = [] for line in f: # KITTI tracking benchmark data format: # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry) line = line.strip() fields = line.split(" ") # classes that should be loaded (ignored neighboring classes) if "car" in cls.lower(): classes = ["car", "van"] elif "pedestrian" in cls.lower(): classes = ["pedestrian", "person_sitting"] else: classes = [cls.lower()] classes += ["dontcare"] if not any([s for s in classes if s in fields[2].lower()]): continue # get fields from table t_data.frame = int(float(fields[0])) # frame t_data.track_id = int(float(fields[1])) # id t_data.obj_type = fields[ 2].lower() # object type [car, pedestrian, cyclist, ...] 
t_data.truncation = int( float(fields[3])) # truncation [-1,0,1,2] t_data.occlusion = int( float(fields[4])) # occlusion [-1,0,1,2] t_data.obs_angle = float(fields[5]) # observation angle [rad] t_data.x1 = float(fields[6]) # left [px] t_data.y1 = float(fields[7]) # top [px] t_data.x2 = float(fields[8]) # right [px] t_data.y2 = float(fields[9]) # bottom [px] t_data.h = float(fields[10]) # height [m] t_data.w = float(fields[11]) # width [m] t_data.l = float(fields[12]) # length [m] t_data.X = float(fields[13]) # X [m] t_data.Y = float(fields[14]) # Y [m] t_data.Z = float(fields[15]) # Z [m] t_data.yaw = float(fields[16]) # yaw angle [rad] if not loading_groundtruth: if len(fields) == 17: t_data.score = -1 elif len(fields) == 18: t_data.score = float(fields[17]) # detection score else: logger.info("file is not in KITTI format") return # do not consider objects marked as invalid if t_data.track_id is -1 and t_data.obj_type != "dontcare": continue idx = t_data.frame # check if length for frame data is sufficient if idx >= len(f_data): print("extend f_data", idx, len(f_data)) f_data += [[] for x in range(max(500, idx - len(f_data)))] try: id_frame = (t_data.frame, t_data.track_id) if id_frame in id_frame_cache and not loading_groundtruth: logger.info( "track ids are not unique for sequence %d: frame %d" % (seq, t_data.frame)) logger.info( "track id %d occurred at least twice for this frame" % t_data.track_id) logger.info("Exiting...") #continue # this allows to evaluate non-unique result files return False id_frame_cache.append(id_frame) f_data[t_data.frame].append(copy.copy(t_data)) except: print(len(f_data), idx) raise if t_data.track_id not in ids and t_data.obj_type != "dontcare": ids.append(t_data.track_id) n_trajectories += 1 n_in_seq += 1 # check if uploaded data provides information for 2D and 3D evaluation if not loading_groundtruth and eval_2d is True and ( t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or t_data.y2 == -1): eval_2d = False if not loading_groundtruth and eval_3d is True and ( t_data.X == -1000 or t_data.Y == -1000 or t_data.Z == -1000): eval_3d = False # only add existing frames n_trajectories_seq.append(n_in_seq) seq_data.append(f_data) f.close() if not loading_groundtruth: self.tracker = seq_data self.n_tr_trajectories = n_trajectories self.eval_2d = eval_2d self.eval_3d = eval_3d self.n_tr_seq = n_trajectories_seq if self.n_tr_trajectories == 0: return False else: # split ground truth and DontCare areas self.dcareas = [] self.groundtruth = [] for seq_idx in range(len(seq_data)): seq_gt = seq_data[seq_idx] s_g, s_dc = [], [] for f in range(len(seq_gt)): all_gt = seq_gt[f] g, dc = [], [] for gg in all_gt: if gg.obj_type == "dontcare": dc.append(gg) else: g.append(gg) s_g.append(g) s_dc.append(dc) self.dcareas.append(s_dc) self.groundtruth.append(s_g) self.n_gt_seq = n_trajectories_seq self.n_gt_trajectories = n_trajectories return True def boxoverlap(self, a, b, criterion="union"): """ boxoverlap computes intersection over union for bbox a and b in KITTI format. If the criterion is 'union', overlap = (a inter b) / a union b). If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area. """ x1 = max(a.x1, b.x1) y1 = max(a.y1, b.y1) x2 = min(a.x2, b.x2) y2 = min(a.y2, b.y2) w = x2 - x1 h = y2 - y1 if w <= 0. or h <= 0.: return 0. 
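        # NOTE (added comment): boxes overlap past this point. For example, two 10x10 boxes offset by one pixel in x and y give inter = 9 * 9 = 81, union = 100 + 100 - 81 = 119, IoU ~= 0.68.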
    def boxoverlap(self, a, b, criterion="union"):
        """
            boxoverlap computes intersection over union for bbox a and b in KITTI format.
            If the criterion is 'union', overlap = (a inter b) / (a union b).
            If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area.
        """
        x1 = max(a.x1, b.x1)
        y1 = max(a.y1, b.y1)
        x2 = min(a.x2, b.x2)
        y2 = min(a.y2, b.y2)
        w = x2 - x1
        h = y2 - y1
        if w <= 0. or h <= 0.:
            return 0.
        inter = w * h
        aarea = (a.x2 - a.x1) * (a.y2 - a.y1)
        barea = (b.x2 - b.x1) * (b.y2 - b.y1)
        # intersection over union overlap
        if criterion.lower() == "union":
            o = inter / float(aarea + barea - inter)
        elif criterion.lower() == "a":
            o = float(inter) / float(aarea)
        else:
            raise TypeError("Unknown type for criterion")
        return o

    def compute3rdPartyMetrics(self):
        """
            Computes the metrics defined in
                - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance:
                  The CLEAR MOT Metrics (MOTA, MOTAL, MOTP)
                - Nevatia 2008: Global Data Association for Multi-Object Tracking
                  Using Network Flows (MT/PT/ML)
        """
        # construct Munkres object for Hungarian Method association
        hm = Munkres()
        max_cost = 1e9

        # go through all frames and associate ground truth and tracker results
        # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections
        fr, ids = 0, 0
        for seq_idx in range(len(self.groundtruth)):
            seq_gt = self.groundtruth[seq_idx]
            seq_dc = self.dcareas[seq_idx]  # don't care areas
            seq_tracker = self.tracker[seq_idx]
            seq_trajectories = defaultdict(list)
            seq_ignored = defaultdict(list)

            # statistics over the current sequence, check the corresponding
            # variable comments in __init__ to get their meaning
            seqtp = 0
            seqitp = 0
            seqfn = 0
            seqifn = 0
            seqfp = 0
            seqigt = 0
            seqitr = 0

            last_ids = [[], []]
            n_gts = 0
            n_trs = 0
            for f in range(len(seq_gt)):
                g = seq_gt[f]
                dc = seq_dc[f]
                t = seq_tracker[f]
                # counting total number of ground truth and tracker objects
                self.n_gt += len(g)
                self.n_tr += len(t)
                n_gts += len(g)
                n_trs += len(t)

                # use hungarian method to associate, using boxoverlap 0..1 as cost
                # build cost matrix
                cost_matrix = []
                this_ids = [[], []]
                for gg in g:
                    # save current ids
                    this_ids[0].append(gg.track_id)
                    this_ids[1].append(-1)
                    gg.tracker = -1
                    gg.id_switch = 0
                    gg.fragmentation = 0
                    cost_row = []
                    for tt in t:
                        # overlap == 1 is cost == 0
                        c = 1 - self.boxoverlap(gg, tt)
                        # gating for boxoverlap
                        if c <= self.min_overlap:
                            cost_row.append(c)
                        else:
                            cost_row.append(max_cost)  # = 1e9
                    cost_matrix.append(cost_row)
                    # all ground truth trajectories are initially not associated
                    # extend groundtruth trajectories lists (merge lists)
                    seq_trajectories[gg.track_id].append(-1)
                    seq_ignored[gg.track_id].append(False)
                if len(g) == 0:
                    cost_matrix = [[]]
                # associate
                association_matrix = hm.compute(cost_matrix)

                # tmp variables for sanity checks and MODP computation
                tmptp = 0
                tmpfp = 0
                tmpfn = 0
                tmpc = 0  # this will sum up the overlaps for all true positives
                tmpcs = [0] * len(g)  # this will save the overlaps for all true positives
                # the reason is that some true positives might be ignored
                # later such that the corresponding overlaps can
                # be subtracted from tmpc for MODP computation

                # mapping for tracker ids and ground truth ids
                for row, col in association_matrix:
                    # apply gating on boxoverlap
                    c = cost_matrix[row][col]
                    if c < max_cost:
                        g[row].tracker = t[col].track_id
                        this_ids[1][row] = t[col].track_id
                        t[col].valid = True
                        g[row].distance = c
                        self.total_cost += 1 - c
                        tmpc += 1 - c
                        tmpcs[row] = 1 - c
                        seq_trajectories[g[row].track_id][-1] = t[col].track_id
                        # true positives are only valid associations
                        self.tp += 1
                        tmptp += 1
                    else:
                        g[row].tracker = -1
                        self.fn += 1
                        tmpfn += 1

                # associate tracker and DontCare areas
                # ignore tracker in neighboring classes
                nignoredtracker = 0  # number of ignored tracker detections
                ignoredtrackers = dict()  # will associate the track_id with -1
                # if it is not ignored and 1 if it is ignored;
                # this is used to avoid double counting ignored
                # cases, see the next loop
                for tt in t:
                    ignoredtrackers[tt.track_id] = -1
                    # ignore detection if it belongs to a neighboring class or is
                    # smaller or equal to the minimum height
                    tt_height = abs(tt.y1 - tt.y2)
                    if ((self.cls == "car" and tt.obj_type == "van") or
                        (self.cls == "pedestrian" and
                         tt.obj_type == "person_sitting") or
                            tt_height <= self.min_height) and not tt.valid:
                        nignoredtracker += 1
                        tt.ignored = True
                        ignoredtrackers[tt.track_id] = 1
                        continue
                    for d in dc:
                        overlap = self.boxoverlap(tt, d, "a")
                        if overlap > 0.5 and not tt.valid:
                            tt.ignored = True
                            nignoredtracker += 1
                            ignoredtrackers[tt.track_id] = 1
                            break

                # check for ignored FN/TP (truncation or neighboring object class)
                ignoredfn = 0  # the number of ignored false negatives
                nignoredtp = 0  # the number of ignored true positives
                nignoredpairs = 0  # the number of ignored pairs, i.e. a true positive
                # which is ignored but where the associated tracker
                # detection has already been ignored
                gi = 0
                for gg in g:
                    if gg.tracker < 0:
                        if gg.occlusion > self.max_occlusion or gg.truncation > self.max_truncation \
                                or (self.cls == "car" and gg.obj_type == "van") \
                                or (self.cls == "pedestrian" and gg.obj_type == "person_sitting"):
                            seq_ignored[gg.track_id][-1] = True
                            gg.ignored = True
                            ignoredfn += 1
                    elif gg.tracker >= 0:
                        if gg.occlusion > self.max_occlusion or gg.truncation > self.max_truncation \
                                or (self.cls == "car" and gg.obj_type == "van") \
                                or (self.cls == "pedestrian" and gg.obj_type == "person_sitting"):
                            seq_ignored[gg.track_id][-1] = True
                            gg.ignored = True
                            nignoredtp += 1
                            # if the associated tracker detection is already ignored,
                            # we want to avoid double counting ignored detections
                            if ignoredtrackers[gg.tracker] > 0:
                                nignoredpairs += 1
                            # for computing MODP, the overlaps from ignored detections
                            # are subtracted
                            tmpc -= tmpcs[gi]
                    gi += 1

                # the below might be confusing, check the comments in __init__
                # to see what the individual statistics represent

                # correct TP by number of ignored TP due to truncation
                # ignored TP are shown as tracked in visualization
                tmptp -= nignoredtp
                # count the number of ignored true positives
                self.itp += nignoredtp
                # adjust the number of ground truth objects considered
                self.n_gt -= (ignoredfn + nignoredtp)
                # count the number of ignored ground truth objects
                self.n_igt += ignoredfn + nignoredtp
                # count the number of ignored tracker objects
                self.n_itr += nignoredtracker
                # count the number of ignored pairs, i.e. associated tracker and
                # ground truth objects that are both ignored
                self.n_igttr += nignoredpairs

                # false negatives = associated gt bboxes exceeding association threshold + non-associated gt bboxes
                tmpfn += len(g) - len(association_matrix) - ignoredfn
                self.fn += len(g) - len(association_matrix) - ignoredfn
                self.ifn += ignoredfn
                # false positives = tracker bboxes - associated tracker bboxes
                # mismatches (mme_t)
                tmpfp += len(t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
                self.fp += len(t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs

                # update sequence data
                seqtp += tmptp
                seqitp += nignoredtp
                seqfp += tmpfp
                seqfn += tmpfn
                seqifn += ignoredfn
                seqigt += ignoredfn + nignoredtp
                seqitr += nignoredtracker

                # sanity checks
                # - the number of true positives minus ignored true positives
                #   should be greater or equal to 0
                # - the number of false negatives should be greater or equal to 0
                # - the number of false positives needs to be greater or equal to 0
                #   otherwise ignored detections might be counted double
                # - the number of counted true positives (plus ignored ones)
                #   and the number of counted false negatives (plus ignored ones)
                #   should match the total number of ground truth objects
                # - the number of counted true positives (plus ignored ones)
                #   and the number of counted false positives
                #   plus the number of ignored tracker detections should
                #   match the total number of tracker detections; note that
                #   nignoredpairs is subtracted here to avoid double counting
                #   of ignored detections in nignoredtp and nignoredtracker
                if tmptp < 0:
                    print(tmptp, nignoredtp)
                    raise NameError("Something went wrong! TP is negative")
                if tmpfn < 0:
                    print(tmpfn, len(g), len(association_matrix), ignoredfn,
                          nignoredpairs)
                    raise NameError("Something went wrong! FN is negative")
                if tmpfp < 0:
                    print(tmpfp, len(t), tmptp, nignoredtracker, nignoredtp,
                          nignoredpairs)
                    raise NameError("Something went wrong! FP is negative")
                if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:
                    print("seqidx", seq_idx)
                    print("frame ", f)
                    print("TP    ", tmptp)
                    print("FN    ", tmpfn)
                    print("FP    ", tmpfp)
                    print("nGT   ", len(g))
                    print("nAss  ", len(association_matrix))
                    print("ign GT", ignoredfn)
                    print("ign TP", nignoredtp)
                    raise NameError(
                        "Something went wrong! nGroundtruth is not TP+FN")
                if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(t):
                    print(seq_idx, f, len(t), tmptp, tmpfp)
                    print(len(association_matrix), association_matrix)
                    raise NameError(
                        "Something went wrong! nTracker is not TP+FP")

                # check for id switches or fragmentations
                for i, tt in enumerate(this_ids[0]):
                    if tt in last_ids[0]:
                        idx = last_ids[0].index(tt)
                        tid = this_ids[1][i]
                        lid = last_ids[1][idx]
                        if tid != lid and lid != -1 and tid != -1:
                            if g[i].truncation < self.max_truncation:
                                g[i].id_switch = 1
                                ids += 1
                        if tid != lid and lid != -1:
                            if g[i].truncation < self.max_truncation:
                                g[i].fragmentation = 1
                                fr += 1

                # save current index
                last_ids = this_ids
                # compute MOTP_t
                MODP_t = 1
                if tmptp != 0:
                    MODP_t = tmpc / float(tmptp)
                self.MODP_t.append(MODP_t)

            # remove empty lists for current gt trajectories
            self.gt_trajectories[seq_idx] = seq_trajectories
            self.ign_trajectories[seq_idx] = seq_ignored

            # gather statistics for "per sequence" statistics.
self.n_gts.append(n_gts) self.n_trs.append(n_trs) self.tps.append(seqtp) self.itps.append(seqitp) self.fps.append(seqfp) self.fns.append(seqfn) self.ifns.append(seqifn) self.n_igts.append(seqigt) self.n_itrs.append(seqitr) # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories n_ignored_tr_total = 0 for seq_idx, ( seq_trajectories, seq_ignored ) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)): if len(seq_trajectories) == 0: continue tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5 n_ignored_tr = 0 for g, ign_g in zip(seq_trajectories.values(), seq_ignored.values()): # all frames of this gt trajectory are ignored if all(ign_g): n_ignored_tr += 1 n_ignored_tr_total += 1 continue # all frames of this gt trajectory are not assigned to any detections if all([this == -1 for this in g]): tmpML += 1 self.ML += 1 continue # compute tracked frames in trajectory last_id = g[0] # first detection (necessary to be in gt_trajectories) is always tracked tracked = 1 if g[0] >= 0 else 0 lgt = 0 if ign_g[0] else 1 for f in range(1, len(g)): if ign_g[f]: last_id = -1 continue lgt += 1 if last_id != g[f] and last_id != -1 and g[f] != -1 and g[ f - 1] != -1: tmpId_switches += 1 self.id_switches += 1 if f < len(g) - 1 and g[f - 1] != g[ f] and last_id != -1 and g[f] != -1 and g[f + 1] != -1: tmpFragments += 1 self.fragments += 1 if g[f] != -1: tracked += 1 last_id = g[f] # handle last frame; tracked state is handled in for loop (g[f]!=-1) if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[ f] != -1 and not ign_g[f]: tmpFragments += 1 self.fragments += 1 # compute MT/PT/ML tracking_ratio = tracked / float(len(g) - sum(ign_g)) if tracking_ratio > 0.8: tmpMT += 1 self.MT += 1 elif tracking_ratio < 0.2: tmpML += 1 self.ML += 1 else: # 0.2 <= tracking_ratio <= 0.8 tmpPT += 1 self.PT += 1 if (self.n_gt_trajectories - n_ignored_tr_total) == 0: self.MT = 0. self.PT = 0. self.ML = 0. else: self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total) self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total) self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total) # precision/recall etc. if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0: self.recall = 0. self.precision = 0. else: self.recall = self.tp / float(self.tp + self.fn) self.precision = self.tp / float(self.fp + self.tp) if (self.recall + self.precision) == 0: self.F1 = 0. else: self.F1 = 2. 
* (self.precision * self.recall) / ( self.precision + self.recall) if sum(self.n_frames) == 0: self.FAR = "n/a" else: self.FAR = self.fp / float(sum(self.n_frames)) # compute CLEARMOT if self.n_gt == 0: self.MOTA = -float("inf") self.MODA = -float("inf") else: self.MOTA = 1 - (self.fn + self.fp + self.id_switches ) / float(self.n_gt) self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt) if self.tp == 0: self.MOTP = float("inf") else: self.MOTP = self.total_cost / float(self.tp) if self.n_gt != 0: if self.id_switches == 0: self.MOTAL = 1 - (self.fn + self.fp + self.id_switches ) / float(self.n_gt) else: self.MOTAL = 1 - (self.fn + self.fp + math.log10(self.id_switches) ) / float(self.n_gt) else: self.MOTAL = -float("inf") if sum(self.n_frames) == 0: self.MODP = "n/a" else: self.MODP = sum(self.MODP_t) / float(sum(self.n_frames)) return True def createSummary(self): summary = "" summary += "tracking evaluation summary".center(80, "=") + "\n" summary += self.printEntry("Multiple Object Tracking Accuracy (MOTA)", self.MOTA) + "\n" summary += self.printEntry("Multiple Object Tracking Precision (MOTP)", self.MOTP) + "\n" summary += self.printEntry("Multiple Object Tracking Accuracy (MOTAL)", self.MOTAL) + "\n" summary += self.printEntry("Multiple Object Detection Accuracy (MODA)", self.MODA) + "\n" summary += self.printEntry("Multiple Object Detection Precision (MODP)", self.MODP) + "\n" summary += "\n" summary += self.printEntry("Recall", self.recall) + "\n" summary += self.printEntry("Precision", self.precision) + "\n" summary += self.printEntry("F1", self.F1) + "\n" summary += self.printEntry("False Alarm Rate", self.FAR) + "\n" summary += "\n" summary += self.printEntry("Mostly Tracked", self.MT) + "\n" summary += self.printEntry("Partly Tracked", self.PT) + "\n" summary += self.printEntry("Mostly Lost", self.ML) + "\n" summary += "\n" summary += self.printEntry("True Positives", self.tp) + "\n" #summary += self.printEntry("True Positives per Sequence", self.tps) + "\n" summary += self.printEntry("Ignored True Positives", self.itp) + "\n" #summary += self.printEntry("Ignored True Positives per Sequence", self.itps) + "\n" summary += self.printEntry("False Positives", self.fp) + "\n" #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" summary += self.printEntry("False Negatives", self.fn) + "\n" #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" summary += self.printEntry("ID-switches", self.id_switches) + "\n" self.fp = self.fp / self.n_gt self.fn = self.fn / self.n_gt self.id_switches = self.id_switches / self.n_gt summary += self.printEntry("False Positives Ratio", self.fp) + "\n" #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" summary += self.printEntry("False Negatives Ratio", self.fn) + "\n" #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" summary += self.printEntry("Ignored False Negatives Ratio", self.ifn) + "\n" #summary += self.printEntry("Ignored False Negatives per Sequence", self.ifns) + "\n" summary += self.printEntry("Missed Targets", self.fn) + "\n" summary += self.printEntry("ID-switches", self.id_switches) + "\n" summary += self.printEntry("Fragmentations", self.fragments) + "\n" summary += "\n" summary += self.printEntry("Ground Truth Objects (Total)", self.n_gt + self.n_igt) + "\n" #summary += self.printEntry("Ground Truth Objects (Total) per Sequence", self.n_gts) + "\n" summary += self.printEntry("Ignored Ground Truth Objects", self.n_igt) + "\n" #summary += 
self.printEntry("Ignored Ground Truth Objects per Sequence", self.n_igts) + "\n"
        summary += self.printEntry("Ground Truth Trajectories",
                                   self.n_gt_trajectories) + "\n"
        summary += "\n"
        summary += self.printEntry("Tracker Objects (Total)", self.n_tr) + "\n"
        #summary += self.printEntry("Tracker Objects (Total) per Sequence", self.n_trs) + "\n"
        summary += self.printEntry("Ignored Tracker Objects", self.n_itr) + "\n"
        #summary += self.printEntry("Ignored Tracker Objects per Sequence", self.n_itrs) + "\n"
        summary += self.printEntry("Tracker Trajectories",
                                   self.n_tr_trajectories) + "\n"
        #summary += "\n"
        #summary += self.printEntry("Ignored Tracker Objects with Associated Ignored Ground Truth Objects", self.n_igttr) + "\n"
        summary += "=" * 80
        return summary

    def printEntry(self, key, val, width=(70, 10)):
        """
            Pretty print an entry in a table fashion.
        """
        s_out = key.ljust(width[0])
        if type(val) == int:
            s = "%%%dd" % width[1]
            s_out += s % val
        elif type(val) == float:
            s = "%%%df" % (width[1])
            s_out += s % val
        else:
            s_out += ("%s" % val).rjust(width[1])
        return s_out

    def saveToStats(self, save_summary):
        """
            Save the statistics in a whitespace-separated file.
        """
        summary = self.createSummary()
        if save_summary:
            filename = os.path.join(self.result_path,
                                    "summary_%s.txt" % self.cls)
            dump = open(filename, "w+")
            dump.write(summary)
            dump.close()
        return summary


class KITTIMOTMetric(Metric):
    def __init__(self, save_summary=True):
        self.save_summary = save_summary
        self.MOTEvaluator = KITTIEvaluation
        self.result_root = None
        self.reset()

    def reset(self):
        self.seqs = []
        self.n_sequences = 0
        self.n_frames = []
        self.strsummary = ''

    def update(self, data_root, seq, data_type, result_root, result_filename):
        assert data_type == 'kitti', "data_type should be 'kitti'"
        self.result_root = result_root
        self.gt_path = data_root
        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
        gt = open(gt_path, "r")
        max_frame = 0
        for line in gt:
            line = line.strip()
            line_list = line.split(" ")
            if int(line_list[0]) > max_frame:
                max_frame = int(line_list[0])
        rs = open(result_filename, "r")
        for line in rs:
            line = line.strip()
            line_list = line.split(" ")
            if int(line_list[0]) > max_frame:
                max_frame = int(line_list[0])
        gt.close()
        rs.close()
        self.n_frames.append(max_frame + 1)
        self.seqs.append(seq)
        self.n_sequences += 1

    def accumulate(self):
        logger.info("Processing Result for KITTI Tracking Benchmark")
        e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path, \
            n_frames=self.n_frames, seqs=self.seqs, n_sequences=self.n_sequences)
        try:
            if not e.loadTracker():
                return
            logger.info("Loading Results - Success")
            logger.info("Evaluate Object Class: %s" % e.cls.upper())
        except:
            logger.info("Caught exception while loading result data.")
        if not e.loadGroundtruth():
            raise ValueError("Ground truth not found.")
        logger.info("Loading Groundtruth - Success")
        # sanity checks
        if len(e.groundtruth) != len(e.tracker):
            logger.info(
                "The uploaded data does not provide results for every sequence.")
            return False
        logger.info("Loaded %d Sequences." % len(e.groundtruth))
        logger.info("Start Evaluation...")
        if e.compute3rdPartyMetrics():
            self.strsummary = e.saveToStats(self.save_summary)
        else:
            logger.info(
                "There seem to be no true positives or false positives at all in the submitted data."
            )

    def log(self):
        print(self.strsummary)

    def get_results(self):
        return self.strsummary
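# NOTE: illustrative sketch only. It restates the CLEAR MOT formulas that
# compute3rdPartyMetrics accumulates above, over plain counts; the argument
# names are made up and are not attributes of the evaluator.
def _demo_clearmot_from_counts(n_gt, fn, fp, id_switches, total_cost, tp):
    """MOTA penalizes misses, false positives and ID switches relative to the
    number of ground truth objects; MOTP averages the association cost
    (1 - IoU in this evaluator) over true positives."""
    mota = 1 - (fn + fp + id_switches) / float(n_gt) if n_gt else -float("inf")
    motp = total_cost / float(tp) if tp else float("inf")
    return mota, motp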
================================================
FILE: ppdet/metrics/munkres.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is borrowed from
https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py
"""

import copy  # used by __copy_matrix
import sys

__all__ = ['Munkres', 'make_cost_matrix']


class Munkres:
    """
    Calculate the Munkres solution to the classical assignment problem.
    See the module documentation for usage.
    """

    def __init__(self):
        """Create a new instance"""
        self.C = None
        self.row_covered = []
        self.col_covered = []
        self.n = 0
        self.Z0_r = 0
        self.Z0_c = 0
        self.marked = None
        self.path = None

    def make_cost_matrix(profit_matrix, inversion_function):
        """
        **DEPRECATED**

        Please use the module function ``make_cost_matrix()``.
        """
        import munkres
        return munkres.make_cost_matrix(profit_matrix, inversion_function)

    make_cost_matrix = staticmethod(make_cost_matrix)

    def pad_matrix(self, matrix, pad_value=0):
        """
        Pad a possibly non-square matrix to make it square.

        :Parameters:
            matrix : list of lists
                matrix to pad
            pad_value : int
                value to use to pad the matrix

        :rtype: list of lists
        :return: a new, possibly padded, matrix
        """
        max_columns = 0
        total_rows = len(matrix)
        for row in matrix:
            max_columns = max(max_columns, len(row))
        total_rows = max(max_columns, total_rows)
        new_matrix = []
        for row in matrix:
            row_len = len(row)
            new_row = row[:]
            if total_rows > row_len:
                # Row too short. Pad it.
                new_row += [0] * (total_rows - row_len)
            new_matrix += [new_row]
        while len(new_matrix) < total_rows:
            new_matrix += [[0] * total_rows]
        return new_matrix

    def compute(self, cost_matrix):
        """
        Compute the indexes for the lowest-cost pairings between rows and
        columns in the database. Returns a list of (row, column) tuples
        that can be used to traverse the matrix.

        :Parameters:
            cost_matrix : list of lists
                The cost matrix. If this cost matrix is not square, it
                will be padded with zeros, via a call to ``pad_matrix()``.
                (This method does *not* modify the caller's matrix. It
                operates on a copy of the matrix.)

                **WARNING**: This code handles square and rectangular
                matrices. It does *not* handle irregular matrices.
:rtype: list :return: A list of ``(row, column)`` tuples that describe the lowest cost path through the matrix """ self.C = self.pad_matrix(cost_matrix) self.n = len(self.C) self.original_length = len(cost_matrix) self.original_width = len(cost_matrix[0]) self.row_covered = [False for i in range(self.n)] self.col_covered = [False for i in range(self.n)] self.Z0_r = 0 self.Z0_c = 0 self.path = self.__make_matrix(self.n * 2, 0) self.marked = self.__make_matrix(self.n, 0) done = False step = 1 steps = { 1: self.__step1, 2: self.__step2, 3: self.__step3, 4: self.__step4, 5: self.__step5, 6: self.__step6 } while not done: try: func = steps[step] step = func() except KeyError: done = True # Look for the starred columns results = [] for i in range(self.original_length): for j in range(self.original_width): if self.marked[i][j] == 1: results += [(i, j)] return results def __copy_matrix(self, matrix): """Return an exact copy of the supplied matrix""" return copy.deepcopy(matrix) def __make_matrix(self, n, val): """Create an *n*x*n* matrix, populating it with the specific value.""" matrix = [] for i in range(n): matrix += [[val for j in range(n)]] return matrix def __step1(self): """ For each row of the matrix, find the smallest element and subtract it from every element in its row. Go to Step 2. """ C = self.C n = self.n for i in range(n): minval = min(self.C[i]) # Find the minimum value for this row and subtract that minimum # from every element in the row. for j in range(n): self.C[i][j] -= minval return 2 def __step2(self): """ Find a zero (Z) in the resulting matrix. If there is no starred zero in its row or column, star Z. Repeat for each element in the matrix. Go to Step 3. """ n = self.n for i in range(n): for j in range(n): if (self.C[i][j] == 0) and \ (not self.col_covered[j]) and \ (not self.row_covered[i]): self.marked[i][j] = 1 self.col_covered[j] = True self.row_covered[i] = True self.__clear_covers() return 3 def __step3(self): """ Cover each column containing a starred zero. If K columns are covered, the starred zeros describe a complete set of unique assignments. In this case, Go to DONE, otherwise, Go to Step 4. """ n = self.n count = 0 for i in range(n): for j in range(n): if self.marked[i][j] == 1: self.col_covered[j] = True count += 1 if count >= n: step = 7 # done else: step = 4 return step def __step4(self): """ Find a noncovered zero and prime it. If there is no starred zero in the row containing this primed zero, Go to Step 5. Otherwise, cover this row and uncover the column containing the starred zero. Continue in this manner until there are no uncovered zeros left. Save the smallest uncovered value and Go to Step 6. """ step = 0 done = False row = -1 col = -1 star_col = -1 while not done: (row, col) = self.__find_a_zero() if row < 0: done = True step = 6 else: self.marked[row][col] = 2 star_col = self.__find_star_in_row(row) if star_col >= 0: col = star_col self.row_covered[row] = True self.col_covered[col] = False else: done = True self.Z0_r = row self.Z0_c = col step = 5 return step def __step5(self): """ Construct a series of alternating primed and starred zeros as follows. Let Z0 represent the uncovered primed zero found in Step 4. Let Z1 denote the starred zero in the column of Z0 (if any). Let Z2 denote the primed zero in the row of Z1 (there will always be one). Continue until the series terminates at a primed zero that has no starred zero in its column. 
        Unstar each starred zero of the series, star each primed zero
        of the series, erase all primes and uncover every line in the
        matrix. Return to Step 3.
        """
        count = 0
        path = self.path
        path[count][0] = self.Z0_r
        path[count][1] = self.Z0_c
        done = False
        while not done:
            row = self.__find_star_in_col(path[count][1])
            if row >= 0:
                count += 1
                path[count][0] = row
                path[count][1] = path[count - 1][1]
            else:
                done = True
            if not done:
                col = self.__find_prime_in_row(path[count][0])
                count += 1
                path[count][0] = path[count - 1][0]
                path[count][1] = col
        self.__convert_path(path, count)
        self.__clear_covers()
        self.__erase_primes()
        return 3

    def __step6(self):
        """
        Add the value found in Step 4 to every element of each covered
        row, and subtract it from every element of each uncovered column.
        Return to Step 4 without altering any stars, primes, or covered
        lines.
        """
        minval = self.__find_smallest()
        for i in range(self.n):
            for j in range(self.n):
                if self.row_covered[i]:
                    self.C[i][j] += minval
                if not self.col_covered[j]:
                    self.C[i][j] -= minval
        return 4

    def __find_smallest(self):
        """Find the smallest uncovered value in the matrix."""
        minval = 2e9  # sys.maxint
        for i in range(self.n):
            for j in range(self.n):
                if (not self.row_covered[i]) and (not self.col_covered[j]):
                    if minval > self.C[i][j]:
                        minval = self.C[i][j]
        return minval

    def __find_a_zero(self):
        """Find the first uncovered element with value 0"""
        row = -1
        col = -1
        i = 0
        n = self.n
        done = False
        while not done:
            j = 0
            while True:
                if (self.C[i][j] == 0) and \
                        (not self.row_covered[i]) and \
                        (not self.col_covered[j]):
                    row = i
                    col = j
                    done = True
                j += 1
                if j >= n:
                    break
            i += 1
            if i >= n:
                done = True
        return (row, col)

    def __find_star_in_row(self, row):
        """
        Find the first starred element in the specified row. Returns
        the column index, or -1 if no starred element was found.
        """
        col = -1
        for j in range(self.n):
            if self.marked[row][j] == 1:
                col = j
                break
        return col

    def __find_star_in_col(self, col):
        """
        Find the first starred element in the specified column. Returns
        the row index, or -1 if no starred element was found.
        """
        row = -1
        for i in range(self.n):
            if self.marked[i][col] == 1:
                row = i
                break
        return row

    def __find_prime_in_row(self, row):
        """
        Find the first primed element in the specified row. Returns
        the column index, or -1 if no primed element was found.
        """
        col = -1
        for j in range(self.n):
            if self.marked[row][j] == 2:
                col = j
                break
        return col

    def __convert_path(self, path, count):
        for i in range(count + 1):
            if self.marked[path[i][0]][path[i][1]] == 1:
                self.marked[path[i][0]][path[i][1]] = 0
            else:
                self.marked[path[i][0]][path[i][1]] = 1

    def __clear_covers(self):
        """Clear all covered matrix cells"""
        for i in range(self.n):
            self.row_covered[i] = False
            self.col_covered[i] = False

    def __erase_primes(self):
        """Erase all prime markings"""
        for i in range(self.n):
            for j in range(self.n):
                if self.marked[i][j] == 2:
                    self.marked[i][j] = 0
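# NOTE: illustrative usage sketch only; the 3x3 cost matrix is made up.
def _demo_munkres_usage():
    matrix = [[5, 9, 1],
              [10, 3, 2],
              [8, 7, 4]]
    m = Munkres()
    indexes = m.compute(matrix)  # list of (row, column) assignment pairs
    total_cost = sum(matrix[r][c] for r, c in indexes)  # minimal total cost
    return indexes, total_cost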
def make_cost_matrix(profit_matrix, inversion_function):
    """
    Create a cost matrix from a profit matrix by calling
    'inversion_function' to invert each value. The inversion
    function must take one numeric argument (of any type) and return
    another numeric argument which is presumed to be the cost inverse
    of the original profit.

    This is a static method. Call it like this:

    .. python::

        cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func)

    For example:

    .. python::

        cost_matrix = Munkres.make_cost_matrix(matrix, lambda x: sys.maxsize - x)

    :Parameters:
        profit_matrix : list of lists
            The matrix to convert from a profit to a cost matrix
        inversion_function : function
            The function to use to invert each entry in the profit matrix

    :rtype: list of lists
    :return: The converted matrix
    """
    cost_matrix = []
    for row in profit_matrix:
        cost_matrix.append([inversion_function(value) for value in row])
    return cost_matrix


================================================
FILE: ppdet/metrics/pose3d_metrics.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle.distributed import ParallelEnv
import os
import json
from collections import defaultdict, OrderedDict
import numpy as np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = ['Pose3DEval']


class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def mean_per_joint_position_error(pred, gt, has_3d_joints):
    """
    Compute mPJPE
    """
    gt = gt[has_3d_joints == 1]
    gt = gt[:, :, :3]
    pred = pred[has_3d_joints == 1]
    with paddle.no_grad():
        gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
        gt = gt - gt_pelvis[:, None, :]
        pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
        pred = pred - pred_pelvis[:, None, :]
        error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy()
        return error
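# NOTE: illustrative numpy restatement of the pelvis-centering step above,
# for intuition only; the real metric runs on paddle tensors and masks
# invalid samples via has_3d_joints.
def _demo_pelvis_centered_error(pred, gt):
    """pred, gt: (num_joints, 3) arrays; joints 2 and 3 are taken as hips."""
    gt = gt - (gt[2] + gt[3]) / 2.0
    pred = pred - (pred[2] + pred[3]) / 2.0
    return np.sqrt(((pred - gt)**2).sum(axis=-1)).mean()  # mPJPE in input units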
def compute_similarity_transform(S1, S2):
    """Computes a similarity transform (sR, t) that takes
    a set of 3D points S1 (3 x N) closest to a set of 3D points S2,
    where R is a 3x3 rotation matrix, t a 3x1 translation, s a scale,
    i.e. it solves the orthogonal Procrustes problem.
    """
    transposed = False
    if S1.shape[0] != 3 and S1.shape[0] != 2:
        S1 = S1.T
        S2 = S2.T
        transposed = True
    assert (S2.shape[1] == S1.shape[1])

    # 1. Remove mean.
    mu1 = S1.mean(axis=1, keepdims=True)
    mu2 = S2.mean(axis=1, keepdims=True)
    X1 = S1 - mu1
    X2 = S2 - mu2

    # 2. Compute variance of X1 used for scale.
    var1 = np.sum(X1**2)

    # 3. The outer product of X1 and X2.
    K = X1.dot(X2.T)

    # 4. Solution that maximizes trace(R'K) is R=U*V', where U, V are
    # singular vectors of K.
    U, s, Vh = np.linalg.svd(K)
    V = Vh.T
    # Construct Z that fixes the orientation of R to get det(R)=1.
    Z = np.eye(U.shape[0])
    Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
    # Construct R.
    R = V.dot(Z.dot(U.T))

    # 5. Recover scale.
    scale = np.trace(R.dot(K)) / var1

    # 6. Recover translation.
    t = mu2 - scale * (R.dot(mu1))

    # 7. Error:
    S1_hat = scale * R.dot(S1) + t

    if transposed:
        S1_hat = S1_hat.T

    return S1_hat


def compute_similarity_transform_batch(S1, S2):
    """Batched version of compute_similarity_transform."""
    S1_hat = np.zeros_like(S1)
    for i in range(S1.shape[0]):
        S1_hat[i] = compute_similarity_transform(S1[i], S2[i])
    return S1_hat


def reconstruction_error(S1, S2, reduction='mean'):
    """Do Procrustes alignment and compute reconstruction error."""
    S1_hat = compute_similarity_transform_batch(S1, S2)
    re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1)
    if reduction == 'mean':
        re = re.mean()
    elif reduction == 'sum':
        re = re.sum()
    return re


def all_gather(data):
    if paddle.distributed.get_world_size() == 1:
        return data
    vlist = []
    paddle.distributed.all_gather(vlist, data)
    data = paddle.concat(vlist, 0)
    return data


class Pose3DEval(object):
    def __init__(self, output_eval, save_prediction_only=False):
        super(Pose3DEval, self).__init__()
        self.output_eval = output_eval
        self.res_file = os.path.join(output_eval, "pose3d_results.json")
        self.save_prediction_only = save_prediction_only
        self.reset()

    def reset(self):
        self.PAmPJPE = AverageMeter()
        self.mPJPE = AverageMeter()
        self.eval_results = {}

    def get_human36m_joints(self, input):
        J24_TO_J14 = paddle.to_tensor(
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
        J24_TO_J17 = paddle.to_tensor(
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])
        return paddle.index_select(input, J24_TO_J14, axis=1)

    def update(self, inputs, outputs):
        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
                                                           .local_rank))
        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
                                                                .local_rank))
        pred_3d_joints = all_gather(outputs['pose3d'])
        if gt_3d_joints.shape[1] == 24:
            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
        if pred_3d_joints.shape[1] == 24:
            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
                                                  has_3d_joints).mean()
        PAmPJPE_val = reconstruction_error(
            pred_3d_joints.numpy(),
            gt_3d_joints[:, :, :3].numpy(),
            reduction=None).mean()
        count = int(np.sum(has_3d_joints.numpy()))
        self.PAmPJPE.update(PAmPJPE_val * 1000., count)
        self.mPJPE.update(mPJPE_val * 1000., count)

    def accumulate(self):
        if self.save_prediction_only:
            logger.info(f'The pose3d result is saved to {self.res_file}; '
                        'the model is not evaluated.')
            return
        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]

    def log(self):
        if self.save_prediction_only:
            return
        stats_names = ['mPJPE', 'PAmPJPE']
        num_values = len(stats_names)
        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
        print('|---' * (num_values + 1) + '|')
        print(' '.join([
            '| {:.3f}'.format(abs(value))
            for value in self.eval_results['pose3d']
        ]) + ' |')

    def get_results(self):
        return self.eval_results


================================================
FILE: ppdet/metrics/widerface_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import cv2
import numpy as np
from collections import OrderedDict

import paddle

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = ['face_eval_run', 'lmk2out']


def face_eval_run(model,
                  image_dir,
                  gt_file,
                  pred_dir='output/pred',
                  eval_mode='widerface',
                  multi_scale=False):
    # load ground truth files
    with open(gt_file, 'r') as f:
        gt_lines = f.readlines()
    imid2path = []
    pos_gt = 0
    while pos_gt < len(gt_lines):
        name_gt = gt_lines[pos_gt].strip('\n\t').split()[0]
        imid2path.append(name_gt)
        pos_gt += 1
        n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0])
        pos_gt += 1 + n_gt
    logger.info('The ground truth file contains {} images'.format(
        len(imid2path)))

    dets_dist = OrderedDict()
    for iter_id, im_path in enumerate(imid2path):
        image_path = os.path.join(image_dir, im_path)
        if eval_mode == 'fddb':
            image_path += '.jpg'
        assert os.path.exists(image_path)
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if multi_scale:
            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
            det0 = detect_face(model, image, shrink)
            det1 = flip_test(model, image, shrink)
            [det2, det3] = multi_scale_test(model, image, max_shrink)
            det4 = multi_scale_test_pyramid(model, image, max_shrink)
            det = np.row_stack((det0, det1, det2, det3, det4))
            dets = bbox_vote(det)
        else:
            dets = detect_face(model, image, 1)
        if eval_mode == 'widerface':
            save_widerface_bboxes(image_path, dets, pred_dir)
        else:
            dets_dist[im_path] = dets
        if iter_id % 100 == 0:
            logger.info('Test iter {}'.format(iter_id))
    if eval_mode == 'fddb':
        save_fddb_bboxes(dets_dist, pred_dir)
    logger.info("Finish evaluation.")
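# NOTE: illustrative sketch only; it condenses the multi_scale branch of
# face_eval_run above into a standalone helper built from the functions
# defined below. `model` and `image` (an RGB HWC array) are assumed inputs.
def _demo_multi_scale_tta(model, image):
    shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
    det0 = detect_face(model, image, shrink)    # base scale
    det1 = flip_test(model, image, shrink)      # horizontally mirrored pass
    det2, det3 = multi_scale_test(model, image, max_shrink)
    det4 = multi_scale_test_pyramid(model, image, max_shrink)
    # merge all scales by score-weighted box voting
    return bbox_vote(np.row_stack((det0, det1, det2, det3, det4)))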
def detect_face(model, image, shrink):
    image_shape = [image.shape[0], image.shape[1]]
    if shrink != 1:
        h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink)
        image = cv2.resize(image, (w, h))
        image_shape = [h, w]

    img = face_img_process(image)
    image_shape = np.asarray([image_shape])
    scale_factor = np.asarray([[shrink, shrink]])
    data = {
        "image": paddle.to_tensor(
            img, dtype='float32'),
        "im_shape": paddle.to_tensor(
            image_shape, dtype='float32'),
        "scale_factor": paddle.to_tensor(
            scale_factor, dtype='float32')
    }
    model.eval()
    detection = model(data)
    detection = detection['bbox'].numpy()
    # layout: xmin, ymin, xmax, ymax, score
    if np.prod(detection.shape) == 1:
        logger.info("No face detected")
        return np.array([[0, 0, 0, 0, 0]])
    det_conf = detection[:, 1]
    det_xmin = detection[:, 2]
    det_ymin = detection[:, 3]
    det_xmax = detection[:, 4]
    det_ymax = detection[:, 5]
    det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
    return det


def flip_test(model, image, shrink):
    img = cv2.flip(image, 1)
    det_f = detect_face(model, img, shrink)
    det_t = np.zeros(det_f.shape)
    img_width = image.shape[1]
    det_t[:, 0] = img_width - det_f[:, 2]
    det_t[:, 1] = det_f[:, 1]
    det_t[:, 2] = img_width - det_f[:, 0]
    det_t[:, 3] = det_f[:, 3]
    det_t[:, 4] = det_f[:, 4]
    return det_t


def multi_scale_test(model, image, max_shrink):
    # Shrunken detecting is only used to detect big faces
    st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink
    det_s = detect_face(model, image, st)
    index = np.where(
        np.maximum(det_s[:, 2] - det_s[:, 0] + 1,
                   det_s[:, 3] - det_s[:, 1] + 1) > 30)[0]
    det_s = det_s[index, :]
    # Enlarge once
    bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
    det_b = detect_face(model, image, bt)

    # Enlarge the small image several times for small faces
    if max_shrink > 2:
        bt *= 2
        while bt < max_shrink:
            det_b = np.row_stack((det_b, detect_face(model, image, bt)))
            bt *= 2
        det_b = np.row_stack((det_b, detect_face(model, image, max_shrink)))

    # Enlarged images are only used to detect small faces.
    if bt > 1:
        index = np.where(
            np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
                       det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
        det_b = det_b[index, :]
    # Shrunken images are only used to detect big faces.
    else:
        index = np.where(
            np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
                       det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
        det_b = det_b[index, :]
    return det_s, det_b


def multi_scale_test_pyramid(model, image, max_shrink):
    # Use image pyramids to detect faces
    det_b = detect_face(model, image, 0.25)
    index = np.where(
        np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
                   det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
    det_b = det_b[index, :]

    st = [0.75, 1.25, 1.5, 1.75]
    for i in range(len(st)):
        if st[i] <= max_shrink:
            det_temp = detect_face(model, image, st[i])
            # Enlarged images are only used to detect small faces.
            if st[i] > 1:
                index = np.where(
                    np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
                               det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
                det_temp = det_temp[index, :]
            # Shrunken images are only used to detect big faces.
            else:
                index = np.where(
                    np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
                               det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
                det_temp = det_temp[index, :]
            det_b = np.row_stack((det_b, det_temp))
    return det_b


def to_chw(image):
    """
    Transpose image from HWC to CHW.
    Args:
        image (np.array): an image with HWC layout.
    """
    # HWC to CHW
    if len(image.shape) == 3:
        image = np.swapaxes(image, 1, 2)
        image = np.swapaxes(image, 1, 0)
    return image


def face_img_process(image,
                     mean=[104., 117., 123.],
                     std=[127.502231, 127.502231, 127.502231]):
    img = np.array(image)
    img = to_chw(img)
    img = img.astype('float32')
    img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
    img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32')
    img = [img]
    img = np.array(img)
    return img
def get_shrink(height, width):
    """
    Args:
        height (int): image height.
        width (int): image width.
    """
    # avoid out of memory
    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5

    def get_round(x, loc):
        str_x = str(x)
        if '.' in str_x:
            str_before, str_after = str_x.split('.')
            len_after = len(str_after)
            if len_after >= 3:
                str_final = str_before + '.' + str_after[0:loc]
                return float(str_final)
            else:
                return x

    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
    if max_shrink >= 1.5 and max_shrink < 2:
        max_shrink = max_shrink - 0.1
    elif max_shrink >= 2 and max_shrink < 3:
        max_shrink = max_shrink - 0.2
    elif max_shrink >= 3 and max_shrink < 4:
        max_shrink = max_shrink - 0.3
    elif max_shrink >= 4 and max_shrink < 5:
        max_shrink = max_shrink - 0.4
    elif max_shrink >= 5:
        max_shrink = max_shrink - 0.5
    elif max_shrink <= 0.1:
        max_shrink = 0.1

    shrink = max_shrink if max_shrink < 1 else 1
    return shrink, max_shrink


def bbox_vote(det):
    order = det[:, 4].ravel().argsort()[::-1]
    det = det[order, :]
    if det.shape[0] == 0:
        dets = np.array([[10, 10, 20, 20, 0.002]])
        det = np.empty(shape=[0, 5])
    while det.shape[0] > 0:
        # IOU
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)

        # nms
        merge_index = np.where(o >= 0.3)[0]
        det_accu = det[merge_index, :]
        det = np.delete(det, merge_index, 0)
        if merge_index.shape[0] <= 1:
            if det.shape[0] == 0:
                try:
                    dets = np.row_stack((dets, det_accu))
                except:
                    dets = det_accu
            continue
        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
        max_score = np.max(det_accu[:, 4])
        det_accu_sum = np.zeros((1, 5))
        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
                                      axis=0) / np.sum(det_accu[:, -1:])
        det_accu_sum[:, 4] = max_score
        try:
            dets = np.row_stack((dets, det_accu_sum))
        except:
            dets = det_accu_sum
    dets = dets[0:750, :]
    keep_index = np.where(dets[:, 4] >= 0.01)[0]
    dets = dets[keep_index, :]
    return dets


def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
    image_name = image_path.split('/')[-1]
    image_class = image_path.split('/')[-2]
    odir = os.path.join(output_dir, image_class)
    if not os.path.exists(odir):
        os.makedirs(odir)

    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
    f = open(ofname, 'w')
    f.write('{:s}\n'.format(image_class + '/' + image_name))
    f.write('{:d}\n'.format(bboxes_scores.shape[0]))
    for box_score in bboxes_scores:
        xmin, ymin, xmax, ymax, score = box_score
        f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
            xmax - xmin + 1), (ymax - ymin + 1), score))
    f.close()
    logger.info("The predicted result is saved as {}".format(ofname))


def save_fddb_bboxes(bboxes_scores,
                     output_dir,
                     output_fname='pred_fddb_res.txt'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    predict_file = os.path.join(output_dir, output_fname)
    f = open(predict_file, 'w')
    for image_path, dets in bboxes_scores.items():
        f.write('{:s}\n'.format(image_path))
        f.write('{:d}\n'.format(dets.shape[0]))
        for box_score in dets:
            xmin, ymin, xmax, ymax, score = box_score
            width, height = xmax - xmin, ymax - ymin
            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
                    .format(xmin, ymin, width, height, score))
    f.close()
    logger.info("The predicted result is saved as {}".format(predict_file))
    return predict_file
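# NOTE: illustrative sketch only; bbox_vote above merges overlapping
# detections by score-weighted averaging. The boxes here are made up.
def _demo_bbox_vote():
    det = np.array([
        [10., 10., 50., 50., 0.9],      # two detections of the same face
        [12., 11., 52., 49., 0.6],
        [200., 200., 240., 260., 0.8],  # a separate face
    ])
    return bbox_vote(det)  # clusters are averaged; the top score is kept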
""" xywh_res = [] for t in results: bboxes = t['bbox'][0] lengths = t['bbox'][1][0] im_ids = np.array(t['im_id'][0]).flatten() if bboxes.shape == (1, 1) or bboxes is None: continue face_index = t['face_index'][0] prior_box = t['prior_boxes'][0] predict_lmk = t['landmark'][0] prior = np.reshape(prior_box, (-1, 4)) predictlmk = np.reshape(predict_lmk, (-1, 10)) k = 0 for a in range(len(lengths)): num = lengths[a] im_id = int(im_ids[a]) for i in range(num): score = bboxes[k][1] theindex = face_index[i][0] me_prior = prior[theindex, :] lmk_pred = predictlmk[theindex, :] prior_w = me_prior[2] - me_prior[0] prior_h = me_prior[3] - me_prior[1] prior_w_center = (me_prior[2] + me_prior[0]) / 2 prior_h_center = (me_prior[3] + me_prior[1]) / 2 lmk_decode = np.zeros((10)) for j in [0, 2, 4, 6, 8]: lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center for j in [1, 3, 5, 7, 9]: lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center im_shape = t['im_shape'][0][a].tolist() image_h, image_w = int(im_shape[0]), int(im_shape[1]) if is_bbox_normalized: lmk_decode = lmk_decode * np.array([ image_w, image_h, image_w, image_h, image_w, image_h, image_w, image_h, image_w, image_h ]) lmk_res = { 'image_id': im_id, 'landmark': lmk_decode, 'score': score, } xywh_res.append(lmk_res) k += 1 return xywh_res def image_eval(pred, gt, ignore, iou_thresh): """ single image evaluation pred: Nx5 xyxys gt: Nx4 xywh ignore: """ _pred = pred.copy() _gt = gt.copy() pred_recall = np.zeros(_pred.shape[0]) recall_list = np.zeros(_gt.shape[0]) proposal_list = np.ones(_pred.shape[0]) _gt[:, 2] = _gt[:, 2] + _gt[:, 0] _gt[:, 3] = _gt[:, 3] + _gt[:, 1] overlaps = bbox_overlaps(_pred[:, :4], _gt) for h in range(_pred.shape[0]): gt_overlap = overlaps[h] max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() if max_overlap >= iou_thresh: if ignore[max_idx] == 0: recall_list[max_idx] = -1 proposal_list[h] = -1 elif recall_list[max_idx] == 0: recall_list[max_idx] = 1 r_keep_index = np.where(recall_list == 1)[0] pred_recall[h] = len(r_keep_index) return pred_recall, proposal_list def bbox_overlaps(boxes1, boxes2): """ Parameters ---------- boxes1: (N, 4) ndarray of float boxes2: (K, 4) ndarray of float Returns ------- overlaps: (N, K) ndarray of overlap between boxes1 and boxes2 """ # Calculate the area of each box box_areas1 = (boxes1[:, 2] - boxes1[:, 0] + 1) * ( boxes1[:, 3] - boxes1[:, 1] + 1) box_areas2 = (boxes2[:, 2] - boxes2[:, 0] + 1) * ( boxes2[:, 3] - boxes2[:, 1] + 1) # Calculate the intersection areas iw = np.minimum(boxes1[:, None, 2], boxes2[None, :, 2]) - np.maximum( boxes1[:, None, 0], boxes2[None, :, 0]) + 1 ih = np.minimum(boxes1[:, None, 3], boxes2[None, :, 3]) - np.maximum( boxes1[:, None, 1], boxes2[None, :, 1]) + 1 # Ensure that the intersection width and height are non-negative iw = np.maximum(iw, 0) ih = np.maximum(ih, 0) # Calculate the intersection area intersection = iw * ih # Calculate the union area union = box_areas1[:, None] + box_areas2[None, :] - intersection union = box_areas1[:, None] + box_areas2[None, :] - intersection union = np.maximum(union, 1e-8) # Calculate the overlaps (intersection over union) overlaps = intersection / union return overlaps def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): pr_info = np.zeros((thresh_num, 2)).astype('float') for t in range(thresh_num): thresh = 1 - (t+1)/thresh_num r_index = np.where(pred_info[:, 4] >= thresh)[0] if len(r_index) == 0: pr_info[t, 0] = 0 pr_info[t, 1] = 0 else: r_index = r_index[-1] p_index = 
def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall):
    pr_info = np.zeros((thresh_num, 2)).astype('float')
    for t in range(thresh_num):
        thresh = 1 - (t + 1) / thresh_num
        r_index = np.where(pred_info[:, 4] >= thresh)[0]
        if len(r_index) == 0:
            pr_info[t, 0] = 0
            pr_info[t, 1] = 0
        else:
            r_index = r_index[-1]
            p_index = np.where(proposal_list[:r_index + 1] == 1)[0]
            pr_info[t, 0] = len(p_index)
            pr_info[t, 1] = pred_recall[r_index]
    return pr_info


def dataset_pr_info(thresh_num, pr_curve, count_face):
    _pr_curve = np.zeros((thresh_num, 2))
    for i in range(thresh_num):
        _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0]
        _pr_curve[i, 1] = pr_curve[i, 1] / count_face
    return _pr_curve


def voc_ap(rec, prec):
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap
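# NOTE: illustrative sketch only; voc_ap above integrates the precision
# envelope over recall. The PR points here are made up.
def _demo_voc_ap():
    rec = np.array([0.2, 0.4, 0.6, 0.8])
    prec = np.array([1.0, 0.9, 0.7, 0.5])
    return voc_ap(rec, prec)  # area under the interpolated PR curve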
================================================
FILE: ppdet/model_zoo/.gitignore
================================================
MODEL_ZOO

================================================
FILE: ppdet/model_zoo/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import model_zoo
from .model_zoo import *

__all__ = model_zoo.__all__

================================================
FILE: ppdet/model_zoo/model_zoo.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os.path as osp
import pkg_resources

try:
    from collections.abc import Sequence
except:
    from collections import Sequence

from ppdet.core.workspace import load_config, create
from ppdet.utils.checkpoint import load_weight
from ppdet.utils.download import get_config_path
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = [
    'list_model', 'get_config_file', 'get_weights_url', 'get_model',
    'MODEL_ZOO_FILENAME'
]

MODEL_ZOO_FILENAME = 'MODEL_ZOO'


def list_model(filters=[]):
    model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo',
                                                     MODEL_ZOO_FILENAME)
    with open(model_zoo_file) as f:
        model_names = f.read().splitlines()

    # filter model_name
    def filt(name):
        for f in filters:
            if name.find(f) < 0:
                return False
        return True

    if isinstance(filters, str) or not isinstance(filters, Sequence):
        filters = [filters]
    model_names = [name for name in model_names if filt(name)]
    if len(model_names) == 0 and len(filters) > 0:
        raise ValueError("no model found, please check the filters setting, "
                         "filters can be set as following kinds:\n"
                         "\tDataset:      coco, voc ...\n"
                         "\tArchitecture: yolo, rcnn, ssd ...\n"
                         "\tBackbone:     resnet, vgg, darknet ...\n")

    model_str = "Available Models:\n"
    for model_name in model_names:
        model_str += "\t{}\n".format(model_name)
    logger.info(model_str)


# models and configs are saved on bcebos under the dygraph directory
def get_config_file(model_name):
    return get_config_path("ppdet://configs/{}.yml".format(model_name))


def get_weights_url(model_name):
    return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1])


def get_model(model_name, pretrained=True):
    cfg_file = get_config_file(model_name)
    cfg = load_config(cfg_file)
    model = create(cfg.architecture)

    if pretrained:
        load_weight(model, get_weights_url(model_name))
    return model

================================================
FILE: ppdet/model_zoo/tests/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

================================================
FILE: ppdet/model_zoo/tests/test_get_model.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import paddle import ppdet import unittest # NOTE: weights downloading costs time, we choose # a small model for unittesting MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco' class TestGetConfigFile(unittest.TestCase): def test_main(self): try: cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME) assert os.path.isfile(cfg_file) except: self.assertTrue(False) class TestGetModel(unittest.TestCase): def test_main(self): try: model = ppdet.model_zoo.get_model(MODEL_NAME) assert isinstance(model, paddle.nn.Layer) except: self.assertTrue(False) if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/model_zoo/tests/test_list_model.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import unittest import ppdet class TestListModel(unittest.TestCase): def setUp(self): self._filter = [] def test_main(self): try: ppdet.model_zoo.list_model(self._filter) self.assertTrue(True) except: self.assertTrue(False) class TestListModelYOLO(TestListModel): def setUp(self): self._filter = ['yolo'] class TestListModelRCNN(TestListModel): def setUp(self): self._filter = ['rcnn'] class TestListModelSSD(TestListModel): def setUp(self): self._filter = ['ssd'] class TestListModelMultiFilter(TestListModel): def setUp(self): self._filter = ['yolo', 'darknet'] class TestListModelError(unittest.TestCase): def setUp(self): self._filter = ['xxx'] def test_main(self): try: ppdet.model_zoo.list_model(self._filter) self.assertTrue(False) except ValueError: self.assertTrue(True) if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/modeling/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import warnings warnings.filterwarnings( action='ignore', category=DeprecationWarning, module='ops') from . import ops from . import backbones from . import necks from . import proposal_generator from . import heads from . import losses from . import architectures from . import post_process from . import layers from . import reid from . import mot from . import transformers from . import assigners from . 
import rbox_utils from . import ssod from .ops import * from .backbones import * from .necks import * from .proposal_generator import * from .heads import * from .losses import * from .architectures import * from .post_process import * from .layers import * from .reid import * from .mot import * from .transformers import * from .assigners import * from .rbox_utils import * from .ssod import * ================================================ FILE: ppdet/modeling/architectures/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import meta_arch from . import faster_rcnn from . import mask_rcnn from . import yolo from . import ppyoloe from . import cascade_rcnn from . import ssd from . import fcos from . import solov2 from . import ttfnet from . import s2anet from . import keypoint_hrhrnet from . import keypoint_hrnet from . import keypoint_vitpose from . import jde from . import deepsort from . import fairmot from . import centernet from . import gfl from . import picodet from . import detr from . import sparse_rcnn from . import tood from . import retinanet from . import bytetrack from . import yolox from . import yolof from . import pose3d_metro from . import centertrack from . import queryinst from . import detr_ssod from . import multi_stream_detector from . import clrnet from .meta_arch import * from .faster_rcnn import * from .mask_rcnn import * from .yolo import * from .ppyoloe import * from .cascade_rcnn import * from .ssd import * from .fcos import * from .solov2 import * from .ttfnet import * from .s2anet import * from .keypoint_hrhrnet import * from .keypoint_hrnet import * from .keypoint_vitpose import * from .jde import * from .deepsort import * from .fairmot import * from .centernet import * from .blazeface import * from .gfl import * from .picodet import * from .detr import * from .sparse_rcnn import * from .tood import * from .retinanet import * from .bytetrack import * from .yolox import * from .yolof import * from .pose3d_metro import * from .centertrack import * from .queryinst import * from .keypoint_petr import * from .detr_ssod import * from .multi_stream_detector import * from .clrnet import * from . import rtdetrv3 from .rtdetrv3 import * ================================================ FILE: ppdet/modeling/architectures/blazeface.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import paddle import paddle.nn.functional as F __all__ = ['BlazeFace'] @register class BlazeFace(BaseArch): """ BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs, see https://arxiv.org/abs/1907.05047 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance blaze_head (nn.Layer): `blazeHead` instance post_process (object): `BBoxPostProcess` instance """ __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone, blaze_head, neck, post_process): super(BlazeFace, self).__init__() self.backbone = backbone self.neck = neck self.blaze_head = blaze_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} blaze_head = create(cfg['blaze_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'blaze_head': blaze_head, } def _forward(self): # Backbone body_feats = self.backbone(self.inputs) # neck neck_feats = self.neck(body_feats) # blaze Head if self.training: return self.blaze_head(neck_feats, self.inputs['image'], self.inputs['gt_bbox'], self.inputs['gt_class']) else: preds, anchors = self.blaze_head(neck_feats, self.inputs['image']) bbox, bbox_num, nms_keep_idx = self.post_process( preds, anchors, self.inputs['im_shape'], self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ preds_logits = preds[1] # [[1xNumBBoxNumClass]] extra_data['scores'] = F.softmax(paddle.concat( preds_logits, axis=1)).transpose([0, 2, 1]) extra_data['logits'] = paddle.concat( preds_logits, axis=1).transpose([0, 2, 1]) extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox, bbox_num, extra_data else: return bbox, bbox_num def get_loss(self, ): return {"loss": self._forward()} def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, extra_data = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, "extra_data": extra_data } else: bbox_pred, bbox_num = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, } return output ================================================ FILE: ppdet/modeling/architectures/bytetrack.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
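BlazeFace.from_config above shows the wiring convention used throughout this package: each component publishes an out_shape, and the next component is constructed with input_shape taken from its predecessor. A standalone sketch of that contract with dummy stand-ins (none of these classes are ppdet components):

# Dummy classes for illustration only; real components come from create().
class DummyBackbone:
    out_shape = [16, 32, 64]          # per-level channel counts


class DummyNeck:
    def __init__(self, input_shape):
        self.out_shape = [c * 2 for c in input_shape]


class DummyHead:
    def __init__(self, input_shape):
        self.input_shape = input_shape


backbone = DummyBackbone()
neck = DummyNeck(input_shape=backbone.out_shape)   # neck consumes backbone shapes
head = DummyHead(input_shape=neck.out_shape)       # head consumes neck shapes
assert head.input_shape == [32, 64, 128]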
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['ByteTrack'] @register class ByteTrack(BaseArch): """ ByteTrack network, see https://arxiv.org/abs/2110.06864 Args: detector (object): detector model instance reid (object): reid model instance, default None tracker (object): tracker instance """ __category__ = 'architecture' def __init__(self, detector='YOLOX', reid=None, tracker='JDETracker'): super(ByteTrack, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) if cfg['reid'] != 'None': reid = create(cfg['reid']) else: reid = None tracker = create(cfg['tracker']) return { "detector": detector, "reid": reid, "tracker": tracker, } def _forward(self): det_outs = self.detector(self.inputs) if self.training: return det_outs else: if self.reid is not None: assert 'crops' in self.inputs crops = self.inputs['crops'] pred_embs = self.reid(crops) else: pred_embs = None det_outs['embeddings'] = pred_embs return det_outs def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/cascade_rcnn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
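ByteTrack above treats ReID as optional: from_config builds it only when cfg['reid'] is not the literal string 'None', and _forward simply attaches embeddings (or None) to the detector's output dict. A toy sketch of that contract with made-up values:

# Detector output dict as ByteTrack._forward sees it at inference time.
det_outs = {'bbox': [[0, 0.92, 10., 12., 56., 80.]], 'bbox_num': [1]}

reid = None  # the cfg['reid'] == 'None' case: no ReID model configured
if reid is not None:
    pred_embs = reid(det_outs)      # would embed the detection crops
else:
    pred_embs = None
det_outs['embeddings'] = pred_embs  # the tracker decides how to use them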
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['CascadeRCNN'] @register class CascadeRCNN(BaseArch): """ Cascade R-CNN network, see https://arxiv.org/abs/1712.00726 Args: backbone (object): backbone instance rpn_head (object): `RPNHead` instance bbox_head (object): `BBoxHead` instance bbox_post_process (object): `BBoxPostProcess` instance neck (object): 'FPN' instance mask_head (object): `MaskHead` instance mask_post_process (object): `MaskPostProcess` instance """ __category__ = 'architecture' __inject__ = [ 'bbox_post_process', 'mask_post_process', ] def __init__(self, backbone, rpn_head, bbox_head, bbox_post_process, neck=None, mask_head=None, mask_post_process=None): super(CascadeRCNN, self).__init__() self.backbone = backbone self.rpn_head = rpn_head self.bbox_head = bbox_head self.bbox_post_process = bbox_post_process self.neck = neck self.mask_head = mask_head self.mask_post_process = mask_post_process self.with_mask = mask_head is not None @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) bbox_head = create(cfg['bbox_head'], **kwargs) out_shape = neck and out_shape or bbox_head.get_head().out_shape kwargs = {'input_shape': out_shape} mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "rpn_head": rpn_head, "bbox_head": bbox_head, "mask_head": mask_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, self.inputs) rois, rois_num = self.bbox_head.get_assigned_rois() bbox_targets = self.bbox_head.get_assigned_targets() if self.with_mask: mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, bbox_targets, bbox_feat) return rpn_loss, bbox_loss, mask_loss else: return rpn_loss, bbox_loss, {} else: rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) refined_rois = self.bbox_head.get_refined_rois() im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bbox, bbox_num, nms_keep_idx = self.bbox_post_process( preds, (refined_rois, rois_num), im_shape, scale_factor) # rescale the prediction back to origin image bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( bbox, bbox_num, im_shape, scale_factor) if not self.with_mask: return bbox_pred, bbox_num, None mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs) origin_shape = self.bbox_post_process.get_origin_shape() mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, origin_shape) return bbox_pred, bbox_num, mask_pred def get_loss(self, ): rpn_loss, bbox_loss, mask_loss = self._forward() loss = {} loss.update(rpn_loss) loss.update(bbox_loss) if self.with_mask: loss.update(mask_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): bbox_pred, bbox_num, mask_pred = self._forward() output = { 'bbox': bbox_pred, 
'bbox_num': bbox_num, } if self.with_mask: output.update({'mask': mask_pred}) return output ================================================ FILE: ppdet/modeling/architectures/centernet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['CenterNet'] @register class CenterNet(BaseArch): """ CenterNet network, see http://arxiv.org/abs/1904.07850 Args: backbone (object): backbone instance neck (object): FPN instance, default use 'CenterNetDLAFPN' head (object): 'CenterNetHead' instance post_process (object): 'CenterNetPostProcess' instance for_mot (bool): whether return other features used in tracking model """ __category__ = 'architecture' __inject__ = ['post_process'] __shared__ = ['for_mot'] def __init__(self, backbone, neck='CenterNetDLAFPN', head='CenterNetHead', post_process='CenterNetPostProcess', for_mot=False): super(CenterNet, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.post_process = post_process self.for_mot = for_mot @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} head = create(cfg['head'], **kwargs) return {'backbone': backbone, 'neck': neck, "head": head} def _forward(self): neck_feat = self.backbone(self.inputs) if self.neck is not None: neck_feat = self.neck(neck_feat) head_out = self.head(neck_feat, self.inputs) if self.for_mot: head_out.update({'neck_feat': neck_feat}) elif self.training: head_out['loss'] = head_out.pop('det_loss') return head_out def get_pred(self): head_out = self._forward() bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process( head_out['heatmap'], head_out['size'], head_out['offset'], im_shape=self.inputs['im_shape'], scale_factor=self.inputs['scale_factor']) if self.for_mot: output = { "bbox": bbox, "bbox_num": bbox_num, "bbox_inds": bbox_inds, "topk_clses": topk_clses, "topk_ys": topk_ys, "topk_xs": topk_xs, "neck_feat": head_out['neck_feat'] } else: output = {"bbox": bbox, "bbox_num": bbox_num} return output def get_loss(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/centertrack.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import math import numpy as np import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ..keypoint_utils import affine_transform from ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian __all__ = ['CenterTrack'] @register class CenterTrack(BaseArch): """ CenterTrack network, see http://arxiv.org/abs/2004.01177 Args: detector (object): 'CenterNet' instance plugin_head (object): 'CenterTrackHead' instance tracker (object): 'CenterTracker' instance """ __category__ = 'architecture' __shared__ = ['mot_metric'] def __init__(self, detector='CenterNet', plugin_head='CenterTrackHead', tracker='CenterTracker', mot_metric=False): super(CenterTrack, self).__init__() self.detector = detector self.plugin_head = plugin_head self.tracker = tracker self.mot_metric = mot_metric self.pre_image = None self.deploy = False @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape kwargs = {'input_shape': detector_out_shape} plugin_head = create(cfg['plugin_head'], **kwargs) tracker = create(cfg['tracker']) return { 'detector': detector, 'plugin_head': plugin_head, 'tracker': tracker, } def _forward(self): if self.training: det_outs = self.detector(self.inputs) neck_feat = det_outs['neck_feat'] losses = {} for k, v in det_outs.items(): if 'loss' not in k: continue losses.update({k: v}) plugin_outs = self.plugin_head(neck_feat, self.inputs) for k, v in plugin_outs.items(): if 'loss' not in k: continue losses.update({k: v}) losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss'] return losses else: if not self.mot_metric: # detection, support bs>=1 det_outs = self.detector(self.inputs) return { 'bbox': det_outs['bbox'], 'bbox_num': det_outs['bbox_num'] } else: # MOT, only support bs=1 if not self.deploy: if self.pre_image is None: self.pre_image = self.inputs['image'] # initializing tracker for the first frame self.tracker.init_track([]) self.inputs['pre_image'] = self.pre_image self.pre_image = self.inputs[ 'image'] # Note: update for next image # render input heatmap from tracker status pre_hm = self.get_additional_inputs( self.tracker.tracks, self.inputs, with_hm=True) self.inputs['pre_hm'] = paddle.to_tensor(pre_hm) # model inference det_outs = self.detector(self.inputs) neck_feat = det_outs['neck_feat'] result = self.plugin_head( neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'], det_outs['topk_clses'], det_outs['topk_ys'], det_outs['topk_xs']) if not self.deploy: # convert the cropped and 4x downsampled output coordinate system # back to the input image coordinate system result = self.plugin_head.centertrack_post_process( result, self.inputs, self.tracker.out_thresh) return result def get_pred(self): return self._forward() def get_loss(self): return self._forward() def reset_tracking(self): self.tracker.reset() self.pre_image = None def 
get_additional_inputs(self, dets, meta, with_hm=True): # Render input heatmap from previous trackings. trans_input = meta['trans_input'][0].numpy() inp_width, inp_height = int(meta['inp_width'][0]), int(meta[ 'inp_height'][0]) input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32) for det in dets: if det['score'] < self.tracker.pre_thresh: continue bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width, inp_height) h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] if (h > 0 and w > 0): radius = gaussian_radius( (math.ceil(h), math.ceil(w)), min_overlap=0.7) radius = max(0, int(radius)) ct = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32) ct_int = ct.astype(np.int32) if with_hm: input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int, radius) if with_hm: input_hm = input_hm[np.newaxis] return input_hm def affine_transform_bbox(bbox, trans, width, height): bbox = np.array(copy.deepcopy(bbox), dtype=np.float32) bbox[:2] = affine_transform(bbox[:2], trans) bbox[2:] = affine_transform(bbox[2:], trans) bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1) bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1) return bbox ================================================ FILE: ppdet/modeling/architectures/clrnet.py ================================================ from .meta_arch import BaseArch from ppdet.core.workspace import register, create from paddle import in_dynamic_mode __all__ = ['CLRNet'] @register class CLRNet(BaseArch): __category__ = 'architecture' def __init__(self, backbone="CLRResNet", neck="CLRFPN", clr_head="CLRHead", post_process=None): super(CLRNet, self).__init__() self.backbone = backbone self.neck = neck self.heads = clr_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} clr_head = create(cfg['clr_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'clr_head': clr_head, } def _forward(self): # Backbone body_feats = self.backbone(self.inputs['image']) # neck neck_feats = self.neck(body_feats) # CRL Head if self.training: output = self.heads(neck_feats, self.inputs) else: output = self.heads(neck_feats) output = {'lanes': output} # TODO: hard code fix as_lanes=False problem in clrnet_head.py "get_lanes" function for static mode if in_dynamic_mode(): output = self.heads.get_lanes(output['lanes']) output = { "lanes": output, "img_path": self.inputs['full_img_path'], "img_name": self.inputs['img_name'] } return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/deepsort.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
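get_additional_inputs above renders one gaussian per confident previous-frame track into a single-channel "previous heatmap". A numpy-only sketch of that rendering; the radius handling here is a simplified stand-in for ppdet's gaussian_radius/draw_umich_gaussian pair:

import numpy as np

def render_pre_hm(centers, radii, height, width):
    # One channel, one gaussian splat per track center (integer coords).
    hm = np.zeros((1, height, width), dtype=np.float32)
    for (cx, cy), r in zip(centers, radii):
        y, x = np.ogrid[-r:r + 1, -r:r + 1]
        g = np.exp(-(x * x + y * y) / (2 * (r / 3.0) ** 2 + 1e-6))
        x0, x1 = max(0, cx - r), min(width, cx + r + 1)
        y0, y1 = max(0, cy - r), min(height, cy + r + 1)
        patch = g[(y0 - cy + r):(y1 - cy + r), (x0 - cx + r):(x1 - cx + r)]
        hm[0, y0:y1, x0:x1] = np.maximum(hm[0, y0:y1, x0:x1], patch)
    return hm[np.newaxis]  # add the batch dim, as the method above does

pre_hm = render_pre_hm([(64, 48)], [6], 96, 128)
assert pre_hm.shape == (1, 1, 96, 128)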
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box __all__ = ['DeepSORT'] @register class DeepSORT(BaseArch): """ DeepSORT network, see https://arxiv.org/abs/1703.07402 Args: detector (object): detector model instance reid (object): reid model instance tracker (object): tracker instance """ __category__ = 'architecture' def __init__(self, detector='YOLOv3', reid='PCBPyramid', tracker='DeepSORTTracker'): super(DeepSORT, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker @classmethod def from_config(cls, cfg, *args, **kwargs): if cfg['detector'] != 'None': detector = create(cfg['detector']) else: detector = None reid = create(cfg['reid']) tracker = create(cfg['tracker']) return { "detector": detector, "reid": reid, "tracker": tracker, } def _forward(self): crops = self.inputs['crops'] outs = {} outs['embeddings'] = self.reid(crops) return outs def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/detr.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
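DeepSORT._forward above is deliberately thin: the architecture only embeds the detection crops, and association happens inside the tracker using distances between new embeddings and per-track galleries. A numpy sketch of the usual cosine cost matrix (illustrative, not ppdet's tracker code):

import numpy as np

def cosine_cost(track_embs, det_embs):
    # Rows index tracks, columns index detections; lower cost = better match.
    a = track_embs / np.linalg.norm(track_embs, axis=1, keepdims=True)
    b = det_embs / np.linalg.norm(det_embs, axis=1, keepdims=True)
    return 1.0 - a @ b.T

tracks = np.random.rand(3, 128)
dets = np.random.rand(5, 128)
assert cosine_cost(tracks, dets).shape == (3, 5)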
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create

__all__ = ['DETR']
# Deformable DETR, DINO use the same architecture as DETR


@register
class DETR(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['post_process', 'post_process_semi']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 post_process='DETRPostProcess',
                 post_process_semi=None,
                 with_mask=False,
                 exclude_post_process=False):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.post_process = post_process
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process
        self.post_process_semi = post_process_semi

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # neck
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
        # transformer
        if neck is not None:
            kwargs = {'input_shape': neck.out_shape}
        transformer = create(cfg['transformer'], **kwargs)
        # head
        kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **kwargs)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck
        }

    def _forward(self):
        # Backbone
        body_feats = self.backbone(self.inputs)

        # Neck
        if self.neck is not None:
            body_feats = self.neck(body_feats)

        # Transformer
        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)

        # DETR Head
        if self.training:
            detr_losses = self.detr_head(out_transformer, body_feats,
                                         self.inputs)
            detr_losses.update({
                'loss': paddle.add_n(
                    [v for k, v in detr_losses.items() if 'log' not in k])
            })
            return detr_losses
        else:
            preds = self.detr_head(out_transformer, body_feats)
            if self.exclude_post_process:
                bbox, bbox_num, mask = preds
            else:
                bbox, bbox_num, mask = self.post_process(
                    preds, self.inputs['im_shape'],
                    self.inputs['scale_factor'],
                    self.inputs['image'].shape[2:])

            output = {'bbox': bbox, 'bbox_num': bbox_num}
            if self.with_mask:
                output['mask'] = mask
            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()


================================================
FILE: ppdet/modeling/architectures/detr_ssod.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
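DETR.get_loss above relies on a small idiom that recurs in DETR_SSOD below: every entry of the head's loss dict is summed into a single 'loss' scalar, except diagnostic entries whose keys contain 'log'. With plain floats standing in for paddle tensors:

detr_losses = {'loss_class': 0.7, 'loss_bbox': 1.2, 'loss_giou': 0.5,
               'log_match_ratio': 0.93}   # diagnostic, excluded from the sum

detr_losses['loss'] = sum(
    v for k, v in detr_losses.items() if 'log' not in k)
assert abs(detr_losses['loss'] - 2.4) < 1e-9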
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle
import paddle.nn.functional as F

from ppdet.core.workspace import register, create, merge_config
from ppdet.utils.logger import setup_logger
from ppdet.modeling.ssod.utils import filter_invalid
from .multi_stream_detector import MultiSteamDetector

logger = setup_logger(__name__)

__all__ = ['DETR_SSOD']
__shared__ = ['num_classes']


@register
class DETR_SSOD(MultiSteamDetector):
    def __init__(self,
                 teacher,
                 student,
                 train_cfg=None,
                 test_cfg=None,
                 RTDETRTransformer=None,
                 num_classes=80):
        super(DETR_SSOD, self).__init__(
            dict(
                teacher=teacher, student=student),
            train_cfg=train_cfg,
            test_cfg=test_cfg, )
        self.ema_start_iters = train_cfg['ema_start_iters']
        self.momentum = 0.9996
        self.cls_thr = None
        self.cls_thr_ig = None
        self.num_classes = num_classes
        if train_cfg is not None:
            self.freeze("teacher")
            self.unsup_weight = self.train_cfg['unsup_weight']
            self.sup_weight = self.train_cfg['sup_weight']
        self._teacher = None
        self._student = None
        self._transformer = None

    @classmethod
    def from_config(cls, cfg):
        teacher = create(cfg['teacher'])
        merge_config(cfg)
        student = create(cfg['student'])
        train_cfg = cfg['train_cfg']
        test_cfg = cfg['test_cfg']
        RTDETRTransformer = cfg['RTDETRTransformer']
        return {
            'teacher': teacher,
            'student': student,
            'train_cfg': train_cfg,
            'test_cfg': test_cfg,
            'RTDETRTransformer': RTDETRTransformer
        }

    def forward_train(self, inputs, **kwargs):
        if isinstance(inputs, dict):
            iter_id = inputs['iter_id']
        elif isinstance(inputs, list):
            iter_id = inputs[-1]
        if iter_id == self.ema_start_iters:
            self.update_ema_model(momentum=0)
        elif iter_id > self.ema_start_iters:
            self.update_ema_model(momentum=self.momentum)
        if iter_id > self.ema_start_iters:
            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
            if data_sup_w['image'].shape != data_sup_s['image'].shape:
                data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
                                                                 data_sup_s)
            if 'gt_bbox' in data_unsup_s.keys():
                del data_unsup_s['gt_bbox']
            if 'gt_class' in data_unsup_s.keys():
                del data_unsup_s['gt_class']
            if 'gt_class' in data_unsup_w.keys():
                del data_unsup_w['gt_class']
            if 'gt_bbox' in data_unsup_w.keys():
                del data_unsup_w['gt_bbox']
            for k, v in data_sup_s.items():
                if k in ['epoch_id']:
                    continue
                elif k in ['gt_class', 'gt_bbox', 'is_crowd']:
                    data_sup_s[k].extend(data_sup_w[k])
                else:
                    data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
            loss = {}
            body_feats = self.student.backbone(data_sup_s)
            if self.student.neck is not None:
                body_feats = self.student.neck(body_feats)
            out_transformer = self.student.transformer(body_feats, None,
                                                       data_sup_s)
            sup_loss = self.student.detr_head(out_transformer, body_feats,
                                              data_sup_s)
            sup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in sup_loss.items() if 'log' not in k])
            })
            sup_loss = {"sup_" + k: v for k, v in sup_loss.items()}
            loss.update(**sup_loss)
            unsup_loss = self.foward_unsup_train(data_unsup_w, data_unsup_s)
            unsup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in unsup_loss.items() if 'log' not in k])
            })
            unsup_loss = {"unsup_" + k: v for k, v in unsup_loss.items()}
            loss.update(**unsup_loss)
            loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']})
        else:
            if iter_id == self.ema_start_iters:
                logger.info("start semi_supervised_training")
            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
if data_sup_w['image'].shape != data_sup_s['image'].shape: data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, data_sup_s) for k, v in data_sup_s.items(): if k in ['epoch_id']: continue elif k in ['gt_class', 'gt_bbox', 'is_crowd']: data_sup_s[k].extend(data_sup_w[k]) else: data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) loss = {} sup_loss = self.student(data_sup_s) unsup_loss = { "unsup_" + k: v * paddle.to_tensor(0) for k, v in sup_loss.items() } sup_loss = {"sup_" + k: v for k, v in sup_loss.items()} loss.update(**sup_loss) unsup_loss.update({ 'loss': paddle.add_n( [v * 0 for k, v in sup_loss.items() if 'log' not in k]) }) unsup_loss = {"unsup_" + k: v * 0 for k, v in unsup_loss.items()} loss.update(**unsup_loss) loss.update({'loss': loss['sup_loss']}) return loss def foward_unsup_train(self, data_unsup_w, data_unsup_s): with paddle.no_grad(): body_feats = self.teacher.backbone(data_unsup_w) if self.teacher.neck is not None: body_feats = self.teacher.neck(body_feats, is_teacher=True) out_transformer = self.teacher.transformer( body_feats, None, data_unsup_w, is_teacher=True) preds = self.teacher.detr_head(out_transformer, body_feats) bbox, bbox_num = self.teacher.post_process_semi(preds) self.place = body_feats[0].place proposal_bbox_list = bbox[:, -4:] proposal_bbox_list = proposal_bbox_list.split( tuple(np.array(bbox_num)), 0) proposal_label_list = paddle.cast(bbox[:, :1], np.float32) proposal_label_list = proposal_label_list.split( tuple(np.array(bbox_num)), 0) proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1], np.float32) proposal_score_list = proposal_score_list.split( tuple(np.array(bbox_num)), 0) proposal_bbox_list = [ paddle.to_tensor( p, place=self.place) for p in proposal_bbox_list ] proposal_label_list = [ paddle.to_tensor( p, place=self.place) for p in proposal_label_list ] # filter invalid box roughly if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float): thr = self.train_cfg['pseudo_label_initial_score_thr'] else: # TODO: use dynamic threshold raise NotImplementedError( "Dynamic Threshold is not implemented yet.") proposal_bbox_list, proposal_label_list, proposal_score_list = list( zip(* [ filter_invalid( proposal[:, :4], proposal_label, proposal_score, thr=thr, min_size=self.train_cfg['min_pseduo_box_size'], ) for proposal, proposal_label, proposal_score in zip(proposal_bbox_list, proposal_label_list, proposal_score_list) ])) teacher_bboxes = list(proposal_bbox_list) teacher_labels = proposal_label_list teacher_info = [teacher_bboxes, teacher_labels] student_unsup = data_unsup_s return self.compute_pseudo_label_loss(student_unsup, teacher_info, proposal_score_list) def compute_pseudo_label_loss(self, student_unsup, teacher_info, proposal_score_list): pseudo_bboxes = list(teacher_info[0]) pseudo_labels = list(teacher_info[1]) losses = dict() for i in range(len(pseudo_bboxes)): if pseudo_labels[i].shape[0] == 0: pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy() pseudo_labels[i] = paddle.zeros([0, 1]).numpy() else: pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy() pseudo_labels[i] = pseudo_labels[i].numpy() for i in range(len(pseudo_bboxes)): pseudo_labels[i] = paddle.to_tensor( pseudo_labels[i], dtype=paddle.int32, place=self.place) pseudo_bboxes[i] = paddle.to_tensor( pseudo_bboxes[i], dtype=paddle.float32, place=self.place) student_unsup.update({ 'gt_bbox': pseudo_bboxes, 'gt_class': pseudo_labels }) pseudo_sum = 0 for i in range(len(pseudo_bboxes)): pseudo_sum += pseudo_bboxes[i].sum() if pseudo_sum == 0: #input fake data when 
there are no pseudo labels pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5 pseudo_labels[0] = paddle.ones([1, 1]).astype('int32') student_unsup.update({ 'gt_bbox': pseudo_bboxes, 'gt_class': pseudo_labels }) body_feats = self.student.backbone(student_unsup) if self.student.neck is not None: body_feats = self.student.neck(body_feats) out_transformer = self.student.transformer(body_feats, None, student_unsup) losses = self.student.detr_head(out_transformer, body_feats, student_unsup) for n, v in losses.items(): losses[n] = v * 0 else: gt_bbox = [] gt_class = [] images = [] proposal_score = [] for i in range(len(pseudo_bboxes)): if pseudo_labels[i].shape[0] == 0: continue else: proposal_score.append(proposal_score_list[i].max(-1) .unsqueeze(-1)) gt_class.append(pseudo_labels[i]) gt_bbox.append(pseudo_bboxes[i]) images.append(student_unsup['image'][i]) images = paddle.stack(images) student_unsup.update({ 'image': images, 'gt_bbox': gt_bbox, 'gt_class': gt_class }) body_feats = self.student.backbone(student_unsup) if self.student.neck is not None: body_feats = self.student.neck(body_feats) out_transformer = self.student.transformer(body_feats, None, student_unsup) student_unsup.update({'gt_score': proposal_score}) losses = self.student.detr_head(out_transformer, body_feats, student_unsup) return losses def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return paddle.stack(b, axis=-1) def box_xyxy_to_cxcywh(x): x0, y0, x1, y1 = x.unbind(-1) b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return paddle.stack(b, axis=-1) def get_size_with_aspect_ratio(image_size, size, max_size=None): w, h = image_size if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (w, h) if w < h: ow = size oh = int(size * h / w) else: oh = size ow = int(size * w / h) return (ow, oh) def align_weak_strong_shape(data_weak, data_strong): shape_x = data_strong['image'].shape[2] shape_y = data_strong['image'].shape[3] target_size = [shape_x, shape_y] data_weak['image'] = F.interpolate( data_weak['image'], size=target_size, mode='bilinear', align_corners=False) return data_weak, data_strong ================================================ FILE: ppdet/modeling/architectures/fairmot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
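The two box converters above are inverses of each other; a quick round-trip check with plain tuples in place of paddle tensors:

def cxcywh_to_xyxy(b):
    cx, cy, w, h = b
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

def xyxy_to_cxcywh(b):
    x0, y0, x1, y1 = b
    return ((x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0)

box = (50.0, 40.0, 20.0, 10.0)           # cx, cy, w, h
assert xyxy_to_cxcywh(cxcywh_to_xyxy(box)) == box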
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['FairMOT'] @register class FairMOT(BaseArch): """ FairMOT network, see http://arxiv.org/abs/2004.01888 Args: detector (object): 'CenterNet' instance reid (object): 'FairMOTEmbeddingHead' instance tracker (object): 'JDETracker' instance loss (object): 'FairMOTLoss' instance """ __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, detector='CenterNet', reid='FairMOTEmbeddingHead', tracker='JDETracker', loss='FairMOTLoss'): super(FairMOT, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker self.loss = loss @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape kwargs = {'input_shape': detector_out_shape} reid = create(cfg['reid'], **kwargs) loss = create(cfg['loss']) tracker = create(cfg['tracker']) return { 'detector': detector, 'reid': reid, 'loss': loss, 'tracker': tracker } def _forward(self): loss = dict() # det_outs keys: # train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss) # eval/infer: neck_feat, bbox, bbox_inds det_outs = self.detector(self.inputs) neck_feat = det_outs['neck_feat'] if self.training: reid_loss = self.reid(neck_feat, self.inputs) det_loss = det_outs['det_loss'] loss = self.loss(det_loss, reid_loss) for k, v in det_outs.items(): if 'loss' not in k: continue loss.update({k: v}) loss.update({'reid_loss': reid_loss}) return loss else: pred_dets, pred_embs = self.reid( neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'], det_outs['topk_clses']) return pred_dets, pred_embs def get_pred(self): output = self._forward() return output def get_loss(self): loss = self._forward() return loss ================================================ FILE: ppdet/modeling/architectures/faster_rcnn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
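FairMOT._forward above hands det_loss and reid_loss to a FairMOTLoss module. In the FairMOT paper this combination uses learnable uncertainty weighting; a hedged sketch of that form with plain floats (the initial weight values and the exact expression in ppdet's FairMOTLoss may differ):

import math

def uncertainty_weighted_loss(det_loss, reid_loss, w_det=-1.85, w_reid=-1.05):
    # w_det / w_reid would be trainable scalar parameters in the real module.
    return 0.5 * (math.exp(-w_det) * det_loss +
                  math.exp(-w_reid) * reid_loss + w_det + w_reid)

print(round(uncertainty_weighted_loss(1.2, 0.8), 4))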
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch import numpy as np __all__ = ['FasterRCNN'] @register class FasterRCNN(BaseArch): """ Faster R-CNN network, see https://arxiv.org/abs/1506.01497 Args: backbone (object): backbone instance rpn_head (object): `RPNHead` instance bbox_head (object): `BBoxHead` instance bbox_post_process (object): `BBoxPostProcess` instance neck (object): 'FPN' instance """ __category__ = 'architecture' __inject__ = ['bbox_post_process'] def __init__(self, backbone, rpn_head, bbox_head, bbox_post_process, neck=None): super(FasterRCNN, self).__init__() self.backbone = backbone self.neck = neck self.rpn_head = rpn_head self.bbox_head = bbox_head self.bbox_post_process = bbox_post_process def init_cot_head(self, relationship): self.bbox_head.init_cot_head(relationship) @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) bbox_head = create(cfg['bbox_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "rpn_head": rpn_head, "bbox_head": bbox_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) return rpn_loss, bbox_loss else: rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) preds, _ = self.bbox_head(body_feats, rois, rois_num, None) im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bbox, bbox_num, nms_keep_idx = self.bbox_post_process( preds, (rois, rois_num), im_shape, scale_factor) # rescale the prediction back to origin image bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred( bbox, bbox_num, im_shape, scale_factor) if self.use_extra_data: extra_data = { } # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ extra_data['scores'] = preds[1] # predict scores (probability) # Todo: get logits output extra_data[ 'nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox_pred, bbox_num, extra_data else: return bbox_pred, bbox_num def get_loss(self, ): rpn_loss, bbox_loss = self._forward() loss = {} loss.update(rpn_loss) loss.update(bbox_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, extra_data = self._forward() output = { 'bbox': bbox_pred, 'bbox_num': bbox_num, 'extra_data': extra_data } else: bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output def target_bbox_forward(self, data): body_feats = self.backbone(data) if self.neck is not None: body_feats = self.neck(body_feats) rois = [roi for roi in data['gt_bbox']] rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois]) preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True) return preds def relationship_learning(self, loader, num_classes_novel): print('computing relationship') train_labels_list = [] label_list = 
[] for step_id, data in enumerate(loader): _, bbox_prob = self.target_bbox_forward(data) batch_size = data['im_id'].shape[0] for i in range(batch_size): num_bbox = data['gt_class'][i].shape[0] train_labels = data['gt_class'][i] train_labels_list.append(train_labels.numpy().squeeze(1)) base_labels = bbox_prob.detach().numpy()[:, :-1] label_list.append(base_labels) labels = np.concatenate(train_labels_list, 0) probabilities = np.concatenate(label_list, 0) N_t = np.max(labels) + 1 conditional = [] for i in range(N_t): this_class = probabilities[labels == i] average = np.mean(this_class, axis=0, keepdims=True) conditional.append(average) return np.concatenate(conditional) ================================================ FILE: ppdet/modeling/architectures/fcos.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['FCOS', 'ARSL_FCOS'] @register class FCOS(BaseArch): """ FCOS network, see https://arxiv.org/abs/1904.01355 Args: backbone (object): backbone instance neck (object): 'FPN' instance fcos_head (object): 'FCOSHead' instance ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det(ssod) by DenseTeacher """ __category__ = 'architecture' __inject__ = ['ssod_loss'] def __init__(self, backbone='ResNet', neck='FPN', fcos_head='FCOSHead', ssod_loss='SSODFCOSLoss'): super(FCOS, self).__init__() self.backbone = backbone self.neck = neck self.fcos_head = fcos_head # for ssod, semi-det self.is_teacher = False self.ssod_loss = ssod_loss @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} fcos_head = create(cfg['fcos_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "fcos_head": fcos_head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) self.is_teacher = self.inputs.get('is_teacher', False) if self.training or self.is_teacher: losses = self.fcos_head(fpn_feats, self.inputs) return losses else: fcos_head_outs = self.fcos_head(fpn_feats) bbox_pred, bbox_num = self.fcos_head.post_process( fcos_head_outs, self.inputs['scale_factor']) return {'bbox': bbox_pred, 'bbox_num': bbox_num} def get_loss(self): return self._forward() def get_pred(self): return self._forward() def get_loss_keys(self): return ['loss_cls', 'loss_box', 'loss_quality'] def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, train_cfg) return ssod_losses @register class ARSL_FCOS(BaseArch): """ FCOS ARSL network, see https://arxiv.org/abs/ Args: backbone (object): backbone instance neck (object): 'FPN' instance 
fcos_head (object): 'FCOSHead_ARSL' instance fcos_cr_loss (object): 'FCOSLossCR' instance, only used for semi-det(ssod) by ARSL """ __category__ = 'architecture' __inject__ = ['fcos_cr_loss'] def __init__(self, backbone, neck, fcos_head='FCOSHead_ARSL', fcos_cr_loss='FCOSLossCR'): super(ARSL_FCOS, self).__init__() self.backbone = backbone self.neck = neck self.fcos_head = fcos_head self.fcos_cr_loss = fcos_cr_loss @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} fcos_head = create(cfg['fcos_head'], **kwargs) # consistency regularization loss fcos_cr_loss = create(cfg['fcos_cr_loss']) return { 'backbone': backbone, 'neck': neck, 'fcos_head': fcos_head, 'fcos_cr_loss': fcos_cr_loss, } def forward(self, inputs, branch="supervised", teacher_prediction=None): assert branch in ['supervised', 'semi_supervised'], \ print('In ARSL, type must be supervised or semi_supervised.') if self.data_format == 'NHWC': image = inputs['image'] inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) self.inputs = inputs if self.training: if branch == "supervised": out = self.get_loss() else: out = self.get_pseudo_loss(teacher_prediction) else: # norm test if branch == "supervised": out = self.get_pred() # predict pseudo labels else: out = self.get_pseudo_pred() return out # model forward def model_forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) fcos_head_outs = self.fcos_head(fpn_feats) return fcos_head_outs # supervised loss for labeled data def get_loss(self): loss = {} tag_labels, tag_bboxes, tag_centerness = [], [], [] for i in range(len(self.fcos_head.fpn_stride)): # labels, reg_target, centerness k_lbl = 'labels{}'.format(i) if k_lbl in self.inputs: tag_labels.append(self.inputs[k_lbl]) k_box = 'reg_target{}'.format(i) if k_box in self.inputs: tag_bboxes.append(self.inputs[k_box]) k_ctn = 'centerness{}'.format(i) if k_ctn in self.inputs: tag_centerness.append(self.inputs[k_ctn]) fcos_head_outs = self.model_forward() loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels, tag_bboxes, tag_centerness) loss.update(loss_fcos) return loss # unsupervised loss for unlabeled data def get_pseudo_loss(self, teacher_prediction): loss = {} fcos_head_outs = self.model_forward() unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction) for k in unsup_loss.keys(): loss[k + '_pseudo'] = unsup_loss[k] return loss # get detection results for test, decode and rescale the results to original size def get_pred(self): fcos_head_outs = self.model_forward() scale_factor = self.inputs['scale_factor'] bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs, scale_factor) output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output # generate pseudo labels to guide student def get_pseudo_pred(self): fcos_head_outs = self.model_forward() pred_cls, pred_loc, pred_iou = fcos_head_outs[1:] # 0 is locations for lvl, _ in enumerate(pred_loc): pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl] return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride] ================================================ FILE: ppdet/modeling/architectures/gfl.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['GFL'] @register class GFL(BaseArch): """ Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 Args: backbone (object): backbone instance neck (object): 'FPN' instance head (object): 'GFLHead' instance """ __category__ = 'architecture' def __init__(self, backbone, neck, head='GFLHead'): super(GFL, self).__init__() self.backbone = backbone self.neck = neck self.head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats) if not self.training: im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bboxes, bbox_num = self.head.post_process(head_outs, im_shape, scale_factor) return bboxes, bbox_num else: return head_outs def get_loss(self, ): loss = {} head_outs = self._forward() loss_gfl = self.head.get_loss(head_outs, self.inputs) loss.update(loss_gfl) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output ================================================ FILE: ppdet/modeling/architectures/jde.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['JDE'] @register class JDE(BaseArch): __category__ = 'architecture' __shared__ = ['metric'] """ JDE network, see https://arxiv.org/abs/1909.12605v1 Args: detector (object): detector model instance reid (object): reid model instance tracker (object): tracker instance metric (str): 'MOTDet' for training and detection evaluation, 'ReID' for ReID embedding evaluation, or 'MOT' for multi object tracking evaluation. 
""" def __init__(self, detector='YOLOv3', reid='JDEEmbeddingHead', tracker='JDETracker', metric='MOT'): super(JDE, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker self.metric = metric @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) kwargs = {'input_shape': detector.neck.out_shape} reid = create(cfg['reid'], **kwargs) tracker = create(cfg['tracker']) return { "detector": detector, "reid": reid, "tracker": tracker, } def _forward(self): det_outs = self.detector(self.inputs) if self.training: emb_feats = det_outs['emb_feats'] loss_confs = det_outs['det_losses']['loss_confs'] loss_boxes = det_outs['det_losses']['loss_boxes'] jde_losses = self.reid( emb_feats, self.inputs, loss_confs=loss_confs, loss_boxes=loss_boxes) return jde_losses else: if self.metric == 'MOTDet': det_results = { 'bbox': det_outs['bbox'], 'bbox_num': det_outs['bbox_num'], } return det_results elif self.metric == 'MOT': emb_feats = det_outs['emb_feats'] bboxes = det_outs['bbox'] boxes_idx = det_outs['boxes_idx'] nms_keep_idx = det_outs['nms_keep_idx'] pred_dets, pred_embs = self.reid( emb_feats, self.inputs, bboxes=bboxes, boxes_idx=boxes_idx, nms_keep_idx=nms_keep_idx) return pred_dets, pred_embs else: raise ValueError("Unknown metric {} for multi object tracking.". format(self.metric)) def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/keypoint_hrhrnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from scipy.optimize import linear_sum_assignment from collections import abc, defaultdict import numpy as np import paddle from ppdet.core.workspace import register, create, serializable from .meta_arch import BaseArch from .. 
import layers as L from ..keypoint_utils import transpred __all__ = ['HigherHRNet'] @register class HigherHRNet(BaseArch): __category__ = 'architecture' def __init__(self, backbone='HRNet', hrhrnet_head='HrHRNetHead', post_process='HrHRNetPostProcess', eval_flip=True, flip_perm=None, max_num_people=30): """ HigherHRNet network, see https://arxiv.org/abs/1908.10357; HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175 Args: backbone (nn.Layer): backbone instance hrhrnet_head (nn.Layer): keypoint_head instance bbox_post_process (object): `BBoxPostProcess` instance """ super(HigherHRNet, self).__init__() self.backbone = backbone self.hrhrnet_head = hrhrnet_head self.post_process = post_process self.flip = eval_flip self.flip_perm = paddle.to_tensor(flip_perm) self.deploy = False self.interpolate = L.Upsample(2, mode='bilinear') self.pool = L.MaxPool(5, 1, 2) self.max_num_people = max_num_people @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # head kwargs = {'input_shape': backbone.out_shape} hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs) post_process = create(cfg['post_process']) return { 'backbone': backbone, "hrhrnet_head": hrhrnet_head, "post_process": post_process, } def _forward(self): if self.flip and not self.training and not self.deploy: self.inputs['image'] = paddle.concat( (self.inputs['image'], paddle.flip(self.inputs['image'], [3]))) body_feats = self.backbone(self.inputs) if self.training: return self.hrhrnet_head(body_feats, self.inputs) else: outputs = self.hrhrnet_head(body_feats) if self.flip and not self.deploy: outputs = [paddle.split(o, 2) for o in outputs] output_rflip = [ paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3]) for o in outputs ] output1 = [o[0] for o in outputs] heatmap = (output1[0] + output_rflip[0]) / 2. 
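# Flip-test ensembling: the input batch was doubled with a horizontally
# flipped copy above, so every head output is split back into an
# (original, flipped) pair. Heatmaps from the two views are averaged,
# while the two tagmaps are kept as separate embedding channels for the
# grouping stage.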
tagmaps = [output1[1], output_rflip[1]] outputs = [heatmap] + tagmaps outputs = self.get_topk(outputs) if self.deploy: return outputs res_lst = [] h = self.inputs['im_shape'][0, 0].numpy().item() w = self.inputs['im_shape'][0, 1].numpy().item() kpts, scores = self.post_process(*outputs, h, w) res_lst.append([kpts, scores]) return res_lst def get_loss(self): return self._forward() def get_pred(self): outputs = {} res_lst = self._forward() outputs['keypoint'] = res_lst return outputs def get_topk(self, outputs): # resize to image size outputs = [self.interpolate(x) for x in outputs] if len(outputs) == 3: tagmap = paddle.concat( (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4) else: tagmap = outputs[1].unsqueeze(4) heatmap = outputs[0] N, J = 1, self.hrhrnet_head.num_joints heatmap_maxpool = self.pool(heatmap) # topk maxmap = heatmap * (heatmap == heatmap_maxpool) maxmap = maxmap.reshape([N, J, -1]) heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2) outputs = [heatmap, tagmap, heat_k, inds_k] return outputs @register @serializable class HrHRNetPostProcess(object): ''' HrHRNet postprocess contain: 1) get topk keypoints in the output heatmap 2) sample the tagmap's value corresponding to each of the topk coordinate 3) match different joints to combine to some people with Hungary algorithm 4) adjust the coordinate by +-0.25 to decrease error std 5) salvage missing joints by check positivity of heatmap - tagdiff_norm Args: max_num_people (int): max number of people support in postprocess heat_thresh (float): value of topk below this threshhold will be ignored tag_thresh (float): coord's value sampled in tagmap below this threshold belong to same people for init inputs(list[heatmap]): the output list of model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk original_height, original_width (float): the original image size ''' def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.): self.max_num_people = max_num_people self.heat_thresh = heat_thresh self.tag_thresh = tag_thresh def lerp(self, j, y, x, heatmap): H, W = heatmap.shape[-2:] left = np.clip(x - 1, 0, W - 1) right = np.clip(x + 1, 0, W - 1) up = np.clip(y - 1, 0, H - 1) down = np.clip(y + 1, 0, H - 1) offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, -0.25) offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, -0.25) return offset_y + 0.5, offset_x + 0.5 def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, original_width): N, J, H, W = heatmap.shape assert N == 1, "only support batch size 1" heatmap = heatmap[0].cpu().detach().numpy() tagmap = tagmap[0].cpu().detach().numpy() heats = heat_k[0].cpu().detach().numpy() inds_np = inds_k[0].cpu().detach().numpy() y = inds_np // W x = inds_np % W tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1]) coords = np.stack((y, x), axis=2) # threshold mask = heats > self.heat_thresh # cluster cluster = defaultdict(lambda: { 'coords': np.zeros((J, 2), dtype=np.float32), 'scores': np.zeros(J, dtype=np.float32), 'tags': [] }) for jid, m in enumerate(mask): num_valid = m.sum() if num_valid == 0: continue valid_inds = np.where(m)[0] valid_tags = tags[jid, m, :] if len(cluster) == 0: # initialize for i in valid_inds: tag = tags[jid, i] key = tag[0] cluster[key]['tags'].append(tag) cluster[key]['scores'][jid] = heats[jid, i] cluster[key]['coords'][jid] = coords[jid, i] continue candidates = list(cluster.keys())[:self.max_num_people] 
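# Associative-embedding grouping: joints of the current keypoint id are
# matched to existing cluster centroids (mean tag embeddings) with the
# Hungarian algorithm below; joints left unmatched, or farther than
# tag_thresh in tag space, start new clusters keyed by their own tag value.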
centroids = [ np.mean( cluster[k]['tags'], axis=0) for k in candidates ] num_clusters = len(centroids) # shape is (num_valid, num_clusters, tag_dim) dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] l2_dist = np.linalg.norm(dist, ord=2, axis=2) # modulate dist with heat value, see `use_detection_val` cost = np.round(l2_dist) * 100 - heats[jid, m, None] # pad the cost matrix, otherwise new pose are ignored if num_valid > num_clusters: cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)), 'constant', constant_values=((0, 0), (0, 1e-10))) rows, cols = linear_sum_assignment(cost) for y, x in zip(rows, cols): tag = tags[jid, y] if y < num_valid and x < num_clusters and \ l2_dist[y, x] < self.tag_thresh: key = candidates[x] # merge to cluster else: key = tag[0] # initialize new cluster cluster[key]['tags'].append(tag) cluster[key]['scores'][jid] = heats[jid, y] cluster[key]['coords'][jid] = coords[jid, y] # shape is [k, J, 2] and [k, J] pose_tags = np.array([cluster[k]['tags'] for k in cluster]) pose_coords = np.array([cluster[k]['coords'] for k in cluster]) pose_scores = np.array([cluster[k]['scores'] for k in cluster]) valid = pose_scores > 0 pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32) if valid.sum() == 0: return pose_kpts, pose_kpts # refine coords valid_coords = pose_coords[valid].astype(np.int32) y = valid_coords[..., 0].flatten() x = valid_coords[..., 1].flatten() _, j = np.nonzero(valid) offsets = self.lerp(j, y, x, heatmap) pose_coords[valid, 0] += offsets[0] pose_coords[valid, 1] += offsets[1] # mean score before salvage mean_score = pose_scores.mean(axis=1) pose_kpts[valid, 2] = pose_scores[valid] # salvage missing joints if True: for pid, coords in enumerate(pose_coords): tag_mean = np.array(pose_tags[pid]).mean(axis=0) norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5 score = heatmap - np.round(norm) # (J, H, W) flat_score = score.reshape(J, -1) max_inds = np.argmax(flat_score, axis=1) max_scores = np.max(flat_score, axis=1) salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0) if salvage_joints.sum() == 0: continue y = max_inds[salvage_joints] // W x = max_inds[salvage_joints] % W offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap) y = y.astype(np.float32) + offsets[0] x = x.astype(np.float32) + offsets[1] pose_coords[pid][salvage_joints, 0] = y pose_coords[pid][salvage_joints, 1] = x pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], original_height, original_width, min(H, W)) return pose_kpts, mean_score ================================================ FILE: ppdet/modeling/architectures/keypoint_hrnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
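# --- Illustrative sketch (hypothetical helper, not from the original
# sources): a minimal numpy restatement of the grouping cost built in
# HrHRNetPostProcess above. The rounded tag distance dominates the cost and
# the heat value only breaks ties (the `use_detection_val` trick).
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_joints_to_clusters(valid_tags, centroids, heats):
    # valid_tags: (num_valid, tag_dim), centroids: (num_clusters, tag_dim),
    # heats: (num_valid,) heat values of the candidate joints.
    l2 = np.linalg.norm(valid_tags[:, None, :] - centroids[None, :, :], axis=2)
    cost = np.round(l2) * 100 - heats[:, None]
    rows, cols = linear_sum_assignment(cost)
    return rows, cols, l2
# ---------------------------------------------------------------------------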
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import numpy as np import math import cv2 from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ..keypoint_utils import transform_preds from .. import layers as L from paddle.nn import functional as F __all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet'] @register class TopDownHRNet(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, width, num_joints, backbone='HRNet', loss='KeyPointMSELoss', post_process='HRNetPostProcess', flip_perm=None, flip=True, shift_heatmap=True, use_dark=True): """ HRNet network, see https://arxiv.org/abs/1902.09212 Args: backbone (nn.Layer): backbone instance post_process (object): `HRNetPostProcess` instance flip_perm (list): The left-right joints exchange order list use_dark(bool): Whether to use DARK in post processing """ super(TopDownHRNet, self).__init__() self.backbone = backbone self.post_process = HRNetPostProcess(use_dark) self.loss = loss self.flip_perm = flip_perm self.flip = flip self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) self.shift_heatmap = shift_heatmap self.deploy = False @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) return {'backbone': backbone, } def _forward(self): feats = self.backbone(self.inputs) hrnet_outputs = self.final_conv(feats[0]) if self.training: return self.loss(hrnet_outputs, self.inputs) elif self.deploy: outshape = hrnet_outputs.shape max_idx = paddle.argmax( hrnet_outputs.reshape( (outshape[0], outshape[1], outshape[2] * outshape[3])), axis=-1) return hrnet_outputs, max_idx else: if self.flip: self.inputs['image'] = self.inputs['image'].flip([3]) feats = self.backbone(self.inputs) output_flipped = self.final_conv(feats[0]) output_flipped = self.flip_back(output_flipped.numpy(), self.flip_perm) output_flipped = paddle.to_tensor(output_flipped.copy()) if self.shift_heatmap: output_flipped[:, :, :, 1:] = output_flipped.clone( )[:, :, :, 0:-1] hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5 imshape = (self.inputs['im_shape'].numpy() )[:, ::-1] if 'im_shape' in self.inputs else None center = self.inputs['center'].numpy( ) if 'center' in self.inputs else np.round(imshape / 2.) scale = self.inputs['scale'].numpy( ) if 'scale' in self.inputs else imshape / 200. 
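# center/scale follow the COCO top-down convention (box scale normalized by
# a factor of 200); when the batch carries no crop metadata, the fallbacks
# above treat the whole image as a single person box.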
outputs = self.post_process(hrnet_outputs, center, scale) return outputs def get_loss(self): return self._forward() def get_pred(self): res_lst = self._forward() outputs = {'keypoint': res_lst} return outputs def flip_back(self, output_flipped, matched_parts): assert output_flipped.ndim == 4,\ 'output_flipped should be [batch_size, num_joints, height, width]' output_flipped = output_flipped[:, :, :, ::-1] for pair in matched_parts: tmp = output_flipped[:, pair[0], :, :].copy() output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] output_flipped[:, pair[1], :, :] = tmp return output_flipped class HRNetPostProcess(object): def __init__(self, use_dark=True): self.use_dark = use_dark def get_max_preds(self, heatmaps): '''get predictions from score maps Args: heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints ''' assert isinstance(heatmaps, np.ndarray), 'heatmaps should be numpy.ndarray' assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' batch_size = heatmaps.shape[0] num_joints = heatmaps.shape[1] width = heatmaps.shape[3] heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) idx = np.argmax(heatmaps_reshaped, 2) maxvals = np.amax(heatmaps_reshaped, 2) maxvals = maxvals.reshape((batch_size, num_joints, 1)) idx = idx.reshape((batch_size, num_joints, 1)) preds = np.tile(idx, (1, 1, 2)).astype(np.float32) preds[:, :, 0] = (preds[:, :, 0]) % width preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) pred_mask = pred_mask.astype(np.float32) preds *= pred_mask return preds, maxvals def gaussian_blur(self, heatmap, kernel): border = (kernel - 1) // 2 batch_size = heatmap.shape[0] num_joints = heatmap.shape[1] height = heatmap.shape[2] width = heatmap.shape[3] for i in range(batch_size): for j in range(num_joints): origin_max = np.max(heatmap[i, j]) dr = np.zeros((height + 2 * border, width + 2 * border)) dr[border:-border, border:-border] = heatmap[i, j].copy() dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) heatmap[i, j] = dr[border:-border, border:-border].copy() heatmap[i, j] *= origin_max / np.max(heatmap[i, j]) return heatmap def dark_parse(self, hm, coord): heatmap_height = hm.shape[0] heatmap_width = hm.shape[1] px = int(coord[0]) py = int(coord[1]) if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2: dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \ + hm[py-1][px-1]) dyy = 0.25 * ( hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) derivative = np.matrix([[dx], [dy]]) hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) if dxx * dyy - dxy**2 != 0: hessianinv = hessian.I offset = -hessianinv * derivative offset = np.squeeze(np.array(offset.T), axis=0) coord += offset return coord def dark_postprocess(self, hm, coords, kernelsize): '''DARK postpocessing, Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020). 
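    The refinement fits a second-order Taylor expansion of the log-heatmap
    around the integer argmax: offset = -H^{-1} * g, with gradient g and
    Hessian H estimated by finite differences in dark_parse above.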
''' hm = self.gaussian_blur(hm, kernelsize) hm = np.maximum(hm, 1e-10) hm = np.log(hm) for n in range(coords.shape[0]): for p in range(coords.shape[1]): coords[n, p] = self.dark_parse(hm[n][p], coords[n][p]) return coords def get_final_preds(self, heatmaps, center, scale, kernelsize=3): """the highest heatvalue location with a quarter offset in the direction from the highest response to the second highest response. Args: heatmaps (numpy.ndarray): The predicted heatmaps center (numpy.ndarray): The boxes center scale (numpy.ndarray): The scale factor Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints """ coords, maxvals = self.get_max_preds(heatmaps) heatmap_height = heatmaps.shape[2] heatmap_width = heatmaps.shape[3] if self.use_dark: coords = self.dark_postprocess(heatmaps, coords, kernelsize) else: for n in range(coords.shape[0]): for p in range(coords.shape[1]): hm = heatmaps[n][p] px = int(math.floor(coords[n][p][0] + 0.5)) py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1: diff = np.array([ hm[py][px + 1] - hm[py][px - 1], hm[py + 1][px] - hm[py - 1][px] ]) coords[n][p] += np.sign(diff) * .25 preds = coords.copy() # Transform back for i in range(coords.shape[0]): preds[i] = transform_preds(coords[i], center[i], scale[i], [heatmap_width, heatmap_height]) return preds, maxvals def __call__(self, output, center, scale): preds, maxvals = self.get_final_preds(output.numpy(), center, scale) outputs = [[ np.concatenate( (preds, maxvals), axis=-1), np.mean( maxvals, axis=1) ]] return outputs class TinyPose3DPostProcess(object): def __init__(self): pass def __call__(self, output, center, scale): """ Args: output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords scale (numpy.ndarray): The scale factor Returns: preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords """ preds = output.numpy().copy() # Transform back for i in range(output.shape[0]): # batch_size preds[i][:, 0] = preds[i][:, 0] * scale[i][0] preds[i][:, 1] = preds[i][:, 1] * scale[i][1] return preds def soft_argmax(heatmaps, joint_num): dims = heatmaps.shape depth_dim = (int)(dims[1] / joint_num) heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3])) heatmaps = F.softmax(heatmaps, 2) heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3])) accu_x = heatmaps.sum(axis=(2, 3)) accu_y = heatmaps.sum(axis=(2, 4)) accu_z = heatmaps.sum(axis=(3, 4)) accu_x = accu_x * paddle.arange(1, 33) accu_y = accu_y * paddle.arange(1, 33) accu_z = accu_z * paddle.arange(1, 33) accu_x = accu_x.sum(axis=2, keepdim=True) - 1 accu_y = accu_y.sum(axis=2, keepdim=True) - 1 accu_z = accu_z.sum(axis=2, keepdim=True) - 1 coord_out = paddle.concat( (accu_x, accu_y, accu_z), axis=2) # [batch_size, joint_num, 3] return coord_out @register class TinyPose3DHRHeatmapNet(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__( self, width, # 40, backbone输出的channel数目 num_joints, backbone='HRNet', loss='KeyPointRegressionMSELoss', post_process=TinyPose3DPostProcess): """ Args: backbone (nn.Layer): backbone instance post_process (object): post process instance """ super(TinyPose3DHRHeatmapNet, self).__init__() self.backbone = backbone self.post_process = TinyPose3DPostProcess() self.loss = loss self.deploy = False self.num_joints = num_joints self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, 
bias=True) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) return {'backbone': backbone, } def _forward(self): feats = self.backbone(self.inputs) # feats:[[batch_size, 40, 32, 24]] hrnet_outputs = self.final_conv(feats[0]) res = soft_argmax(hrnet_outputs, self.num_joints) return res def get_loss(self): pose3d = self._forward() loss = self.loss(pose3d, None, self.inputs) outputs = {'loss': loss} return outputs def get_pred(self): res_lst = self._forward() outputs = {'pose3d': res_lst} return outputs def flip_back(self, output_flipped, matched_parts): assert output_flipped.ndim == 4,\ 'output_flipped should be [batch_size, num_joints, height, width]' output_flipped = output_flipped[:, :, :, ::-1] for pair in matched_parts: tmp = output_flipped[:, pair[0], :, :].copy() output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] output_flipped[:, pair[1], :, :] = tmp return output_flipped @register class TinyPose3DHRNet(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, width, num_joints, fc_channel=768, backbone='HRNet', loss='KeyPointRegressionMSELoss', post_process=TinyPose3DPostProcess): """ Args: backbone (nn.Layer): backbone instance post_process (object): post process instance """ super(TinyPose3DHRNet, self).__init__() self.backbone = backbone self.post_process = TinyPose3DPostProcess() self.loss = loss self.deploy = False self.num_joints = num_joints self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3) self.fc1 = paddle.nn.Linear(fc_channel, 256) self.act1 = paddle.nn.ReLU() self.fc2 = paddle.nn.Linear(256, 64) self.act2 = paddle.nn.ReLU() self.fc3 = paddle.nn.Linear(64, 3) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) return {'backbone': backbone, } def _forward(self): ''' self.inputs is a dict ''' feats = self.backbone( self.inputs) # feats:[[batch_size, 40, width/4, height/4]] hrnet_outputs = self.final_conv( feats[0]) # hrnet_outputs: [batch_size, num_joints*32,32,32] flatten_res = self.flatten( hrnet_outputs) # [batch_size,num_joints*32,32*32] res = self.fc1(flatten_res) res = self.act1(res) res = self.fc2(res) res = self.act2(res) res = self.fc3(res) if self.training: return self.loss(res, self.inputs) else: # export model need return res def get_loss(self): return self._forward() def get_pred(self): res_lst = self._forward() outputs = {'pose3d': res_lst} return outputs def flip_back(self, output_flipped, matched_parts): assert output_flipped.ndim == 4,\ 'output_flipped should be [batch_size, num_joints, height, width]' output_flipped = output_flipped[:, :, :, ::-1] for pair in matched_parts: tmp = output_flipped[:, pair[0], :, :].copy() output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] output_flipped[:, pair[1], :, :] = tmp return output_flipped ================================================ FILE: ppdet/modeling/architectures/keypoint_petr.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register from .meta_arch import BaseArch from .. import layers as L __all__ = ['PETR'] @register class PETR(BaseArch): __category__ = 'architecture' __inject__ = ['backbone', 'neck', 'bbox_head'] def __init__(self, backbone='ResNet', neck='ChannelMapper', bbox_head='PETRHead'): """ PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck between backbone and head bbox_head (nn.Layer): model output and loss """ super(PETR, self).__init__() self.backbone = backbone if neck is not None: self.with_neck = True self.neck = neck self.bbox_head = bbox_head self.deploy = False def extract_feat(self, img): """Directly extract features from the backbone+neck.""" x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def get_inputs(self): img_metas = [] gt_bboxes = [] gt_labels = [] gt_keypoints = [] gt_areas = [] pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1) for idx, im_shape in enumerate(self.inputs['im_shape']): img_meta = { 'img_shape': im_shape.astype("int32").tolist() + [1, ], 'batch_input_shape': self.inputs['image'].shape[-2:], 'image_name': self.inputs['image_file'][idx] } img_metas.append(img_meta) if (not pad_gt_mask[idx].any()): gt_keypoints.append(self.inputs['gt_joints'][idx][:1]) gt_labels.append(self.inputs['gt_class'][idx][:1]) gt_bboxes.append(self.inputs['gt_bbox'][idx][:1]) gt_areas.append(self.inputs['gt_areas'][idx][:1]) continue gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]]) gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]]) gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]]) gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]]) return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas def get_loss(self): """ Args: img (Tensor): Input images of shape (N, C, H, W). Typically these should be mean centered and std scaled. img_metas (list[dict]): A List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. gt_bboxes (list[Tensor]): Each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box. gt_keypoints (list[Tensor]): Each item are the truth keypoints for each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas (list[Tensor]): mask areas corresponding to each box. gt_bboxes_ignore (None | list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Returns: dict[str, Tensor]: A dictionary of loss components. 
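    For example, with the K=17 COCO keypoints each gt_keypoints row in this
    format has length 3 * 17 = 51 (an (x, y, visibility) triplet per joint).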
""" img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs( ) gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None) x = self.extract_feat(self.inputs) losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas, gt_bboxes_ignore) loss = 0 for k, v in losses.items(): loss += v losses['loss'] = loss return losses def get_pred_numpy(self): """Used for computing network flops. """ img = self.inputs['image'] batch_size, _, height, width = img.shape dummy_img_metas = [ dict( batch_input_shape=(height, width), img_shape=(height, width, 3), scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size) ] x = self.extract_feat(img) outs = self.bbox_head(x, img_metas=dummy_img_metas) bbox_list = self.bbox_head.get_bboxes( *outs, dummy_img_metas, rescale=True) return bbox_list def get_pred(self): """ """ img = self.inputs['image'] batch_size, _, height, width = img.shape img_metas = [ dict( batch_input_shape=(height, width), img_shape=(height, width, 3), scale_factor=self.inputs['scale_factor'][i]) for i in range(batch_size) ] kptpred = self.simple_test( self.inputs, img_metas=img_metas, rescale=True) keypoints = kptpred[0][1][0] bboxs = kptpred[0][0][0] keypoints[..., 2] = bboxs[:, None, 4] res_lst = [[keypoints, bboxs[:, 4]]] outputs = {'keypoint': res_lst} return outputs def simple_test(self, inputs, img_metas, rescale=False): """Test function without test time augmentation. Args: inputs (list[paddle.Tensor]): List of multiple images. img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. Returns: list[list[np.ndarray]]: BBox and keypoint results of each image and classes. The outer list corresponds to each image. The inner list corresponds to each class. """ batch_size = len(img_metas) assert batch_size == 1, 'Currently only batch_size 1 for inference ' \ f'mode is supported. Found batch_size {batch_size}.' feat = self.extract_feat(inputs) results_list = self.bbox_head.simple_test( feat, img_metas, rescale=rescale) bbox_kpt_results = [ self.bbox_kpt2result(det_bboxes, det_labels, det_kpts, self.bbox_head.num_classes) for det_bboxes, det_labels, det_kpts in results_list ] return bbox_kpt_results def bbox_kpt2result(self, bboxes, labels, kpts, num_classes): """Convert detection results to a list of numpy arrays. Args: bboxes (paddle.Tensor | np.ndarray): shape (n, 5). labels (paddle.Tensor | np.ndarray): shape (n, ). kpts (paddle.Tensor | np.ndarray): shape (n, K, 3). num_classes (int): class number, including background class. Returns: list(ndarray): bbox and keypoint results of each class. """ if bboxes.shape[0] == 0: return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \ [np.zeros((0, kpts.size(1), 3), dtype=np.float32) for i in range(num_classes)] else: if isinstance(bboxes, paddle.Tensor): bboxes = bboxes.numpy() labels = labels.numpy() kpts = kpts.numpy() return [bboxes[labels == i, :] for i in range(num_classes)], \ [kpts[labels == i, :, :] for i in range(num_classes)] ================================================ FILE: ppdet/modeling/architectures/keypoint_vitpose.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import numpy as np import math import cv2 from ppdet.core.workspace import register, create, serializable from .meta_arch import BaseArch from ..keypoint_utils import transform_preds from .. import layers as L __all__ = ['VitPose_TopDown', 'VitPosePostProcess'] @register class VitPose_TopDown(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, backbone, head, loss, post_process, flip_test): """ VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf Args: backbone (nn.Layer): backbone instance post_process (object): `HRNetPostProcess` instance """ super(VitPose_TopDown, self).__init__() self.backbone = backbone self.head = head self.loss = loss self.post_process = post_process self.flip_test = flip_test @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) #head head = create(cfg['head']) #post_process post_process = create(cfg['post_process']) return { 'backbone': backbone, 'head': head, 'post_process': post_process } def _forward_train(self): feats = self.backbone.forward_features(self.inputs['image']) vitpost_output = self.head(feats) return self.loss(vitpost_output, self.inputs) def _forward_test(self): feats = self.backbone.forward_features(self.inputs['image']) output_heatmap = self.head(feats) if self.flip_test: img_flipped = self.inputs['image'].flip(3) features_flipped = self.backbone.forward_features(img_flipped) output_flipped_heatmap = self.head.inference_model(features_flipped, self.flip_test) output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 imshape = (self.inputs['im_shape'].numpy() )[:, ::-1] if 'im_shape' in self.inputs else None center = self.inputs['center'].numpy( ) if 'center' in self.inputs else np.round(imshape / 2.) scale = self.inputs['scale'].numpy( ) if 'scale' in self.inputs else imshape / 200. 
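# As in TopDownHRNet, decoding happens on numpy heatmaps: argmax plus
# sub-pixel refinement, then a map back to original image coordinates
# through the (center, scale) crop metadata prepared above.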
result = self.post_process(output_heatmap.cpu().numpy(), center, scale) return result def get_loss(self): return self._forward_train() def get_pred(self): res_lst = self._forward_test() outputs = {'keypoint': res_lst} return outputs @register @serializable class VitPosePostProcess(object): def __init__(self, use_dark=False): self.use_dark = use_dark def get_max_preds(self, heatmaps): '''get predictions from score maps Args: heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints ''' assert isinstance(heatmaps, np.ndarray), 'heatmaps should be numpy.ndarray' assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' batch_size = heatmaps.shape[0] num_joints = heatmaps.shape[1] width = heatmaps.shape[3] heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) idx = np.argmax(heatmaps_reshaped, 2) maxvals = np.amax(heatmaps_reshaped, 2) maxvals = maxvals.reshape((batch_size, num_joints, 1)) idx = idx.reshape((batch_size, num_joints, 1)) preds = np.tile(idx, (1, 1, 2)).astype(np.float32) preds[:, :, 0] = (preds[:, :, 0]) % width preds[:, :, 1] = np.floor((preds[:, :, 1]) // width) pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) pred_mask = pred_mask.astype(np.float32) preds *= pred_mask return preds, maxvals def post_datk_udp(self, coords, batch_heatmaps, kernel=3): """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020). Note: - batch size: B - num keypoints: K - num persons: N - height of heatmaps: H - width of heatmaps: W B=1 for bottom_up paradigm where all persons share the same heatmap. B=N for top_down paradigm where each person has its own heatmaps. Args: coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps kernel (int): Gaussian kernel size (K) for modulation. Returns: np.ndarray([N, K, 2]): Refined coordinates. 
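    The heatmaps are Gaussian-smoothed, clipped and log-transformed below so
    that the local quadratic (Taylor) model used by the DARK refinement is
    well-conditioned.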
""" if not isinstance(batch_heatmaps, np.ndarray): batch_heatmaps = batch_heatmaps.cpu().numpy() B, K, H, W = batch_heatmaps.shape N = coords.shape[0] assert (B == 1 or B == N) for heatmaps in batch_heatmaps: for heatmap in heatmaps: cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) np.log(batch_heatmaps, batch_heatmaps) batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode='edge').flatten() index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) index = index.astype(int).reshape(-1, 1) i_ = batch_heatmaps_pad[index] ix1 = batch_heatmaps_pad[index + 1] iy1 = batch_heatmaps_pad[index + W + 2] ix1y1 = batch_heatmaps_pad[index + W + 3] ix1_y1_ = batch_heatmaps_pad[index - W - 3] ix1_ = batch_heatmaps_pad[index - 1] iy1_ = batch_heatmaps_pad[index - 2 - W] dx = 0.5 * (ix1 - ix1_) dy = 0.5 * (iy1 - iy1_) derivative = np.concatenate([dx, dy], axis=1) derivative = derivative.reshape(N, K, 2, 1) dxx = ix1 - 2 * i_ + ix1_ dyy = iy1 - 2 * i_ + iy1_ dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) hessian = hessian.reshape(N, K, 2, 2) hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() return coords def transform_preds_udp(self, coords, center, scale, output_size, use_udp=True): """Get final keypoint predictions from heatmaps and apply scaling and translation to map them back to the image. Note: num_keypoints: K Args: coords (np.ndarray[K, ndims]): * If ndims=2, corrds are predicted keypoint location. * If ndims=4, corrds are composed of (x, y, scores, tags) * If ndims=5, corrds are composed of (x, y, scores, tags, flipped_tags) center (np.ndarray[2, ]): Center of the bounding box (x, y). scale (np.ndarray[2, ]): Scale of the bounding box wrt [width, height]. output_size (np.ndarray[2, ] | list(2,)): Size of the destination heatmaps. use_udp (bool): Use unbiased data processing Returns: np.ndarray: Predicted coordinates in the images. """ assert coords.shape[1] in (2, 4, 5) assert len(center) == 2 assert len(scale) == 2 assert len(output_size) == 2 # Recover the scale which is normalized by a factor of 200. scale = scale * 200.0 if use_udp: scale_x = scale[0] / (output_size[0] - 1.0) scale_y = scale[1] / (output_size[1] - 1.0) else: scale_x = scale[0] / output_size[0] scale_y = scale[1] / output_size[1] target_coords = np.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[ 0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[ 1] * 0.5 return target_coords def get_final_preds(self, heatmaps, center, scale, kernelsize=11): """the highest heatvalue location with a quarter offset in the direction from the highest response to the second highest response. 
Args: heatmaps (numpy.ndarray): The predicted heatmaps center (numpy.ndarray): The boxes center scale (numpy.ndarray): The scale factor Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints """ coords, maxvals = self.get_max_preds(heatmaps) N, K, H, W = heatmaps.shape if self.use_dark: coords = self.post_datk_udp(coords, heatmaps, kernelsize) preds = coords.copy() # Transform back to the image for i in range(N): preds[i] = self.transform_preds_udp(preds[i], center[i], scale[i], [W, H]) else: for n in range(coords.shape[0]): for p in range(coords.shape[1]): hm = heatmaps[n][p] px = int(math.floor(coords[n][p][0] + 0.5)) py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < W - 1 and 1 < py < H - 1: diff = np.array([ hm[py][px + 1] - hm[py][px - 1], hm[py + 1][px] - hm[py - 1][px] ]) coords[n][p] += np.sign(diff) * .25 preds = coords.copy() # Transform back for i in range(coords.shape[0]): preds[i] = transform_preds(coords[i], center[i], scale[i], [W, H]) return preds, maxvals def __call__(self, output, center, scale): preds, maxvals = self.get_final_preds(output, center, scale) outputs = [[ np.concatenate( (preds, maxvals), axis=-1), np.mean( maxvals, axis=1) ]] return outputs ================================================ FILE: ppdet/modeling/architectures/mask_rcnn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
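# --- Illustrative sketch (hypothetical, not from the original sources): a
# minimal numpy restatement of the UDP coordinate mapping used by
# VitPosePostProcess.transform_preds_udp above. Heatmap coordinates are
# scaled by scale*200/(output_size - 1) and re-centered on the person box.
import numpy as np

def udp_to_image(coords, center, scale, output_size):
    # coords: (K, 2) heatmap coords; scale is stored normalized by 200.
    scale = np.asarray(scale, dtype=np.float32) * 200.0
    sx = scale[0] / (output_size[0] - 1.0)
    sy = scale[1] / (output_size[1] - 1.0)
    out = np.asarray(coords, dtype=np.float32).copy()
    out[:, 0] = out[:, 0] * sx + center[0] - scale[0] * 0.5
    out[:, 1] = out[:, 1] * sy + center[1] - scale[1] * 0.5
    return out
# ---------------------------------------------------------------------------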
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['MaskRCNN'] @register class MaskRCNN(BaseArch): """ Mask R-CNN network, see https://arxiv.org/abs/1703.06870 Args: backbone (object): backbone instance rpn_head (object): `RPNHead` instance bbox_head (object): `BBoxHead` instance mask_head (object): `MaskHead` instance bbox_post_process (object): `BBoxPostProcess` instance mask_post_process (object): `MaskPostProcess` instance neck (object): 'FPN' instance """ __category__ = 'architecture' __inject__ = [ 'bbox_post_process', 'mask_post_process', ] def __init__(self, backbone, rpn_head, bbox_head, mask_head, bbox_post_process, mask_post_process, neck=None): super(MaskRCNN, self).__init__() self.backbone = backbone self.neck = neck self.rpn_head = rpn_head self.bbox_head = bbox_head self.mask_head = mask_head self.bbox_post_process = bbox_post_process self.mask_post_process = mask_post_process @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) bbox_head = create(cfg['bbox_head'], **kwargs) out_shape = neck and out_shape or bbox_head.get_head().out_shape kwargs = {'input_shape': out_shape} mask_head = create(cfg['mask_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "rpn_head": rpn_head, "bbox_head": bbox_head, "mask_head": mask_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, self.inputs) rois, rois_num = self.bbox_head.get_assigned_rois() bbox_targets = self.bbox_head.get_assigned_targets() # Mask Head needs bbox_feat in Mask RCNN mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, bbox_targets, bbox_feat) return rpn_loss, bbox_loss, mask_loss else: rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None) im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bbox, bbox_num, nms_keep_idx = self.bbox_post_process( preds, (rois, rois_num), im_shape, scale_factor) mask_out = self.mask_head( body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func) # rescale the prediction back to origin image bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( bbox, bbox_num, im_shape, scale_factor) origin_shape = self.bbox_post_process.get_origin_shape() mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, origin_shape) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ extra_data['scores'] = preds[1] # predict scores (probability) # Todo: get logits output extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox_pred, bbox_num, mask_pred, extra_data else: return bbox_pred, bbox_num, mask_pred def get_loss(self, ): bbox_loss, mask_loss, rpn_loss = self._forward() loss = {} loss.update(rpn_loss) loss.update(bbox_loss) 
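# NOTE: _forward() actually returns (rpn_loss, bbox_loss, mask_loss), so the
# unpacking names in get_loss above are transposed; since all three are dicts
# merged into `loss` before paddle.add_n, the summed total is unaffected.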
loss.update(mask_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, mask_pred, extra_data = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data} else: bbox_pred, bbox_num, mask_pred = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} return output ================================================ FILE: ppdet/modeling/architectures/meta_arch.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import typing from ppdet.core.workspace import register from ppdet.modeling.post_process import nms __all__ = ['BaseArch'] @register class BaseArch(nn.Layer): def __init__(self, data_format='NCHW', use_extra_data=False): super(BaseArch, self).__init__() self.data_format = data_format self.inputs = {} self.fuse_norm = False self.use_extra_data = use_extra_data def load_meanstd(self, cfg_transform): scale = 1. mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) for item in cfg_transform: if 'NormalizeImage' in item: mean = np.array( item['NormalizeImage']['mean'], dtype=np.float32) std = np.array(item['NormalizeImage']['std'], dtype=np.float32) if item['NormalizeImage'].get('is_scale', True): scale = 1. / 255. break if self.data_format == 'NHWC': self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3)) self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3)) else: self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1)) self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1)) def forward(self, inputs): if self.data_format == 'NHWC': image = inputs['image'] inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) if self.fuse_norm: image = inputs['image'] self.inputs['image'] = image * self.scale + self.bias self.inputs['im_shape'] = inputs['im_shape'] self.inputs['scale_factor'] = inputs['scale_factor'] else: self.inputs = inputs self.model_arch() if self.training: out = self.get_loss() else: inputs_list = [] # multi-scale input if not isinstance(inputs, typing.Sequence): inputs_list.append(inputs) else: inputs_list.extend(inputs) outs = [] for inp in inputs_list: if self.fuse_norm: self.inputs['image'] = inp['image'] * self.scale + self.bias self.inputs['im_shape'] = inp['im_shape'] self.inputs['scale_factor'] = inp['scale_factor'] else: self.inputs = inp outs.append(self.get_pred()) # multi-scale test if len(outs) > 1: out = self.merge_multi_scale_predictions(outs) else: out = outs[0] return out def merge_multi_scale_predictions(self, outs): # default values for architectures not included in following list num_classes = 80 nms_threshold = 0.5 keep_top_k = 100 if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'): num_classes = self.bbox_head.num_classes keep_top_k = self.bbox_post_process.nms.keep_top_k nms_threshold = self.bbox_post_process.nms.nms_threshold else: raise Exception( "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now" ) final_boxes = [] all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy() for c in range(num_classes): idxs = all_scale_outs[:, 0] == c if np.count_nonzero(idxs) == 0: continue r = nms(all_scale_outs[idxs, 1:], nms_threshold) final_boxes.append( 
np.concatenate([np.full((r.shape[0], 1), c), r], 1)) out = np.concatenate(final_boxes) out = np.concatenate(sorted( out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6)) out = { 'bbox': paddle.to_tensor(out), 'bbox_num': paddle.to_tensor(np.array([out.shape[0], ])) } return out def build_inputs(self, data, input_def): inputs = {} for i, k in enumerate(input_def): inputs[k] = data[i] return inputs def model_arch(self, ): pass def get_loss(self, ): raise NotImplementedError("Should implement get_loss method!") def get_pred(self, ): raise NotImplementedError("Should implement get_pred method!") ================================================ FILE: ppdet/modeling/architectures/multi_stream_detector.py ================================================ from typing import Dict from collections import OrderedDict from ppdet.modeling.architectures.meta_arch import BaseArch class MultiSteamDetector(BaseArch): def __init__(self, model: Dict[str, BaseArch], train_cfg=None, test_cfg=None): super(MultiSteamDetector, self).__init__() self.submodules = list(model.keys()) for k, v in model.items(): setattr(self, k, v) self.train_cfg = train_cfg self.test_cfg = test_cfg self.inference_on = self.test_cfg.get("inference_on", self.submodules[0]) self.first_load = True def forward(self, inputs, return_loss=True, **kwargs): """Calls either :func:`forward_train` or :func:`forward_test` depending on whether ``return_loss`` is ``True``. Note this setting will change the expected inputs. When ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor and List[dict]), and when ``resturn_loss=False``, img and img_meta should be double nested (i.e. List[Tensor], List[List[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(inputs, **kwargs) else: return self.forward_test(inputs, **kwargs) def get_loss(self, **kwargs): # losses = self(**data) return self.forward_train(self, **kwargs) def model(self, **kwargs) -> BaseArch: if "submodule" in kwargs: assert (kwargs["submodule"] in self.submodules ), "Detector does not contain submodule {}".format(kwargs[ "submodule"]) model: BaseArch = getattr(self, kwargs["submodule"]) else: model: BaseArch = getattr(self, self.inference_on) return model def freeze(self, model_ref: str): assert model_ref in self.submodules model = getattr(self, model_ref) model.eval() for param in model.parameters(): param.stop_gradient = True def update_ema_model(self, momentum=0.9996): # print(momentum) model_dict = self.student.state_dict() new_dict = OrderedDict() for key, value in self.teacher.state_dict().items(): if key in model_dict.keys(): new_dict[key] = (model_dict[key] * (1 - momentum) + value * momentum) else: raise Exception("{} is not found in student model".format(key)) self.teacher.set_dict(new_dict) ================================================ FILE: ppdet/modeling/architectures/picodet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['PicoDet'] @register class PicoDet(BaseArch): """ Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 Args: backbone (object): backbone instance neck (object): 'FPN' instance head (object): 'PicoHead' instance """ __category__ = 'architecture' def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False): super(PicoDet, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.export_post_process = True self.export_nms = True self.nms_cpu = nms_cpu @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats, self.export_post_process) if self.training or not self.export_post_process: return head_outs, None else: scale_factor = self.inputs['scale_factor'] bboxes, bbox_num = self.head.post_process( head_outs, scale_factor, export_nms=self.export_nms, nms_cpu=self.nms_cpu) return bboxes, bbox_num def get_loss(self, ): loss = {} head_outs, _ = self._forward() loss_gfl = self.head.get_loss(head_outs, self.inputs) loss.update(loss_gfl) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): if not self.export_post_process: return {'picodet': self._forward()[0]} elif self.export_nms: bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output else: bboxes, mlvl_scores = self._forward() output = {'bbox': bboxes, 'scores': mlvl_scores} return output ================================================ FILE: ppdet/modeling/architectures/pose3d_metro.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, create from .meta_arch import BaseArch from .. 
import layers as L __all__ = ['METRO_Body'] def orthographic_projection(X, camera): """Perform orthographic projection of 3D points X using the camera parameters Args: X: size = [B, N, 3] camera: size = [B, 3] Returns: Projected 2D points -- size = [B, N, 2] """ camera = camera.reshape((-1, 1, 3)) X_trans = X[:, :, :2] + camera[:, :, 1:] shape = X_trans.shape X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape) return X_2d @register class METRO_Body(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__( self, num_joints, backbone='HRNet', trans_encoder='', loss='Pose3DLoss', ): """ Modified from METRO network, see https://arxiv.org/abs/2012.09760 Args: backbone (nn.Layer): backbone instance """ super(METRO_Body, self).__init__() self.num_joints = num_joints self.backbone = backbone self.loss = loss self.deploy = False self.trans_encoder = trans_encoder self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1) self.cam_param_fc = paddle.nn.Linear(3, 2) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) trans_encoder = create(cfg['trans_encoder']) return {'backbone': backbone, 'trans_encoder': trans_encoder} def _forward(self): batch_size = self.inputs['image'].shape[0] image_feat = self.backbone(self.inputs) image_feat_flatten = image_feat.reshape((batch_size, 2048, 49)) image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1)) # and apply a conv layer to learn image token for each 3d joint/vertex position features = self.conv_learn_tokens(image_feat_flatten) # (B, J, C) if self.training: # apply mask vertex/joint modeling # meta_masks is a tensor of all the masks, randomly generated in dataloader # we pre-define a [MASK] token, which is a floating-value vector with 0.01s meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048)) constant_tensor = paddle.ones_like(features) * 0.01 features = features * meta_masks + constant_tensor * (1 - meta_masks ) pred_out = self.trans_encoder(features) pred_3d_joints = pred_out[:, :self.num_joints, :] cam_features = pred_out[:, self.num_joints:, :] # learn camera parameters pred_2d_joints = self.cam_param_fc(cam_features) return pred_3d_joints, pred_2d_joints def get_loss(self): preds_3d, preds_2d = self._forward() loss = self.loss(preds_3d, preds_2d, self.inputs) output = {'loss': loss} return output def get_pred(self): preds_3d, preds_2d = self._forward() outputs = {'pose3d': preds_3d, 'pose2d': preds_2d} return outputs ================================================ FILE: ppdet/modeling/architectures/ppyoloe.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
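# --- Illustrative sketch (hypothetical, not from the original sources): a
# minimal numpy restatement of orthographic_projection used by METRO_Body
# above. A weak-perspective camera (s, tx, ty) scales the translated x/y
# components of the 3D joints.
import numpy as np

def weak_perspective(X, camera):
    # X: (B, N, 3) 3D points, camera: (B, 3) = (scale, tx, ty)
    s = camera[:, None, :1]            # (B, 1, 1) per-sample scale
    t = camera[:, None, 1:]            # (B, 1, 2) per-sample translation
    return s * (X[:, :, :2] + t)       # (B, N, 2) projected 2D points
# ---------------------------------------------------------------------------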
from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead'] # PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head # PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head @register class PPYOLOE(BaseArch): """ PPYOLOE network, see https://arxiv.org/abs/2203.16250 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance yolo_head (nn.Layer): anchor_head instance post_process (object): `BBoxPostProcess` instance ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod) for_distill (bool): whether for distillation feat_distill_place (str): distill which feature for distillation for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ __category__ = 'architecture' __shared__ = ['for_distill'] __inject__ = ['post_process', 'ssod_loss'] def __init__(self, backbone='CSPResNet', neck='CustomCSPPAN', yolo_head='PPYOLOEHead', post_process='BBoxPostProcess', ssod_loss='SSODPPYOLOELoss', for_distill=False, feat_distill_place='neck_feats', with_mask=False, for_mot=False): super(PPYOLOE, self).__init__() self.backbone = backbone self.neck = neck self.yolo_head = yolo_head self.post_process = post_process self.for_mot = for_mot self.with_mask = with_mask # for ssod, semi-det self.is_teacher = False self.ssod_loss = ssod_loss # distill self.for_distill = for_distill self.feat_distill_place = feat_distill_place if for_distill: assert feat_distill_place in ['backbone_feats', 'neck_feats'] @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} yolo_head = create(cfg['yolo_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "yolo_head": yolo_head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) self.is_teacher = self.inputs.get('is_teacher', False) # for semi-det if self.training or self.is_teacher: yolo_losses = self.yolo_head(neck_feats, self.inputs) if self.for_distill: if self.feat_distill_place == 'backbone_feats': self.yolo_head.distill_pairs['backbone_feats'] = body_feats elif self.feat_distill_place == 'neck_feats': self.yolo_head.distill_pairs['neck_feats'] = neck_feats else: raise ValueError return yolo_losses else: yolo_head_outs = self.yolo_head(neck_feats) if self.post_process is not None: bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) else: if not self.with_mask: bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( yolo_head_outs, self.inputs['scale_factor']) else: bbox, bbox_num, mask, nms_keep_idx = self.yolo_head.post_process( yolo_head_outs, im_shape=self.inputs['im_shape'], scale_factor=self.inputs['scale_factor'], infer_shape=self.inputs['image'].shape[2:]) if not self.with_mask: output = {'bbox': bbox, 'bbox_num': bbox_num} else: output = {'bbox': bbox, 'bbox_num': bbox_num, 'mask': mask} if self.with_mask: output['mask'] = mask return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() 
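# NOTE: training and inference share _forward(); which branch runs depends on
# self.training and the 'is_teacher' flag injected by the SSOD trainer, so
# get_loss() and get_pred() above are thin wrappers around it.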
def get_loss_keys(self): return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast'] def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, train_cfg) return ssod_losses @register class PPYOLOEWithAuxHead(BaseArch): __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone='CSPResNet', neck='CustomCSPPAN', yolo_head='PPYOLOEHead', aux_head='SimpleConvHead', post_process='BBoxPostProcess', for_mot=False, detach_epoch=5): """ PPYOLOE network, see https://arxiv.org/abs/2203.16250 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance yolo_head (nn.Layer): anchor_head instance post_process (object): `BBoxPostProcess` instance for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ super(PPYOLOEWithAuxHead, self).__init__() self.backbone = backbone self.neck = neck self.aux_neck = copy.deepcopy(self.neck) self.yolo_head = yolo_head self.aux_head = aux_head self.post_process = post_process self.for_mot = for_mot self.detach_epoch = detach_epoch @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) aux_neck = copy.deepcopy(neck) # head kwargs = {'input_shape': neck.out_shape} yolo_head = create(cfg['yolo_head'], **kwargs) aux_head = create(cfg['aux_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "yolo_head": yolo_head, 'aux_head': aux_head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) if self.training: if self.inputs['epoch_id'] >= self.detach_epoch: aux_neck_feats = self.aux_neck([f.detach() for f in body_feats]) dual_neck_feats = (paddle.concat( [f.detach(), aux_f], axis=1) for f, aux_f in zip(neck_feats, aux_neck_feats)) else: aux_neck_feats = self.aux_neck(body_feats) dual_neck_feats = (paddle.concat( [f, aux_f], axis=1) for f, aux_f in zip(neck_feats, aux_neck_feats)) aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats) loss = self.yolo_head( neck_feats, self.inputs, aux_pred=[aux_cls_scores, aux_bbox_preds]) return loss else: yolo_head_outs = self.yolo_head(neck_feats) if self.post_process is not None: bbox, bbox_num = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) else: bbox, bbox_num = self.yolo_head.post_process( yolo_head_outs, self.inputs['scale_factor']) output = {'bbox': bbox, 'bbox_num': bbox_num} return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/queryinst.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
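# --- Illustrative sketch (not from the repository) -------------------------
# A minimal, registry-free sketch of the dual-neck trick in PPYOLOEWithAuxHead
# above: past `detach_epoch`, the main branch is detached before the
# channel-wise concat, so the auxiliary head stops backpropagating into it.
# All shapes and the 96-channel width are stand-ins, not values from the repo.

import paddle

neck_feats = [paddle.rand([2, 96, 20, 20]), paddle.rand([2, 96, 10, 10])]
aux_neck_feats = [paddle.rand([2, 96, 20, 20]), paddle.rand([2, 96, 10, 10])]

epoch_id, detach_epoch = 6, 5
detach = epoch_id >= detach_epoch
dual_neck_feats = [
    paddle.concat([f.detach() if detach else f, aux_f], axis=1)
    for f, aux_f in zip(neck_feats, aux_neck_feats)
]
assert dual_neck_feats[0].shape == [2, 192, 20, 20]  # channels doubled
# --- end of sketch ----------------------------------------------------------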
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['QueryInst'] @register class QueryInst(BaseArch): __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone, neck, rpn_head, roi_head, post_process='SparsePostProcess'): super(QueryInst, self).__init__() self.backbone = backbone self.neck = neck self.rpn_head = rpn_head self.roi_head = roi_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) roi_head = create(cfg['roi_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'rpn_head': rpn_head, "roi_head": roi_head } def _forward(self, targets=None): features = self.backbone(self.inputs) features = self.neck(features) proposal_bboxes, proposal_features = self.rpn_head(self.inputs[ 'img_whwh']) outputs = self.roi_head(features, proposal_bboxes, proposal_features, targets) if self.training: return outputs else: bbox_pred, bbox_num, mask_pred = self.post_process( outputs['class_logits'], outputs['bbox_pred'], self.inputs['scale_factor_whwh'], self.inputs['ori_shape'], outputs['mask_logits']) return bbox_pred, bbox_num, mask_pred def get_loss(self): targets = [] for i in range(len(self.inputs['img_whwh'])): boxes = self.inputs['gt_bbox'][i] labels = self.inputs['gt_class'][i].squeeze(-1) img_whwh = self.inputs['img_whwh'][i] if boxes.shape[0] != 0: img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1]) else: img_whwh_tgt = paddle.zeros_like(boxes) gt_segm = self.inputs['gt_segm'][i].astype('float32') targets.append({ 'boxes': boxes, 'labels': labels, 'img_whwh': img_whwh, 'img_whwh_tgt': img_whwh_tgt, 'gt_segm': gt_segm }) losses = self._forward(targets) losses.update({'loss': sum(losses.values())}) return losses def get_pred(self): bbox_pred, bbox_num, mask_pred = self._forward() return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} ================================================ FILE: ppdet/modeling/architectures/retinanet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
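# --- Illustrative sketch (not from the repository) -------------------------
# QueryInst.get_loss above tiles each image's `img_whwh` ([w, h, w, h]) once
# per ground-truth box, so every target carries its own normalizer. The same
# step in isolation, with dummy boxes and labels:

import paddle

boxes = paddle.to_tensor([[10., 20., 50., 80.], [0., 0., 30., 30.]])  # (k, 4)
labels = paddle.to_tensor([3, 7])
img_whwh = paddle.to_tensor([640., 480., 640., 480.])

if boxes.shape[0] != 0:
    img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])  # (k, 4)
else:
    img_whwh_tgt = paddle.zeros_like(boxes)

target = {'boxes': boxes, 'labels': labels,
          'img_whwh': img_whwh, 'img_whwh_tgt': img_whwh_tgt}
assert target['img_whwh_tgt'].shape == [2, 4]
# --- end of sketch ----------------------------------------------------------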
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import paddle import paddle.nn.functional as F __all__ = ['RetinaNet'] @register class RetinaNet(BaseArch): __category__ = 'architecture' def __init__(self, backbone, neck, head): super(RetinaNet, self).__init__() self.backbone = backbone self.neck = neck self.head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'head': head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats) if self.training: return self.head(neck_feats, self.inputs) else: head_outs = self.head(neck_feats) bbox, bbox_num, nms_keep_idx = self.head.post_process( head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ preds_logits = self.head.decode_cls_logits(head_outs[0]) preds_scores = F.sigmoid(preds_logits) extra_data['logits'] = preds_logits extra_data['scores'] = preds_scores extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data} else: return {'bbox': bbox, 'bbox_num': bbox_num} def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/rtdetrv3.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
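# --- Illustrative sketch (not from the repository) -------------------------
# RetinaNet.from_config above is the cleanest instance of the pattern every
# architecture in this directory uses: a component's `out_shape` becomes the
# next component's `input_shape`. A registry-free mock of that threading;
# class names and channel counts are invented:

class _Backbone:
    out_shape = [512, 1024, 2048]          # per-level channels, stand-ins

class _Neck:
    def __init__(self, input_shape):
        self.out_shape = [256] * len(input_shape)

class _Head:
    def __init__(self, input_shape):
        self.num_levels = len(input_shape)

backbone = _Backbone()
neck = _Neck(input_shape=backbone.out_shape)   # mirrors create(cfg['neck'], **kwargs)
head = _Head(input_shape=neck.out_shape)       # mirrors create(cfg['head'], **kwargs)
assert head.num_levels == 3
# --- end of sketch ----------------------------------------------------------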
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle

from .meta_arch import BaseArch
from ppdet.core.workspace import register, create

__all__ = ['RTDETRV3']
# Deformable DETR and DINO use the same architecture as DETR


@register
class RTDETRV3(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['post_process', 'post_process_semi']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 aux_o2m_head=None,
                 post_process='DETRPostProcess',
                 post_process_semi=None,
                 with_mask=False,
                 exclude_post_process=False):
        super(RTDETRV3, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.aux_o2m_head = aux_o2m_head
        self.post_process = post_process
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process
        self.post_process_semi = post_process_semi

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # neck
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
        # transformer
        if neck is not None:
            kwargs = {'input_shape': neck.out_shape}
        transformer = create(cfg['transformer'], **kwargs)
        # head
        kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **kwargs)
        kwargs = {'input_shape': neck.out_shape}
        aux_o2m_head = create(cfg['aux_o2m_head'], **kwargs)
        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck,
            "aux_o2m_head": aux_o2m_head
        }

    def _forward(self):
        # Backbone
        body_feats = self.backbone(self.inputs)
        # Neck
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        # Transformer
        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
        # DETR Head
        if self.training:
            detr_losses = self.detr_head(out_transformer, body_feats,
                                         self.inputs)
            detr_losses.update({
                'loss': paddle.add_n(
                    [v for k, v in detr_losses.items() if 'log' not in k])
            })
            if self.aux_o2m_head is not None:
                aux_o2m_losses = self.aux_o2m_head(body_feats, self.inputs)
                for k, v in aux_o2m_losses.items():
                    if k == 'loss':
                        detr_losses[k] += v
                    k = k + '_aux_o2m'
                    detr_losses[k] = v
            return detr_losses
        else:
            preds = self.detr_head(out_transformer, body_feats)
            if self.exclude_post_process:
                bbox, bbox_num, mask = preds
            else:
                # pad shape is the trailing (H, W) of the padded input batch
                bbox, bbox_num, mask = self.post_process(
                    preds, self.inputs['im_shape'],
                    self.inputs['scale_factor'],
                    self.inputs['image'].shape[2:])
            # aux_o2m_outs = self.aux_o2m_head(body_feats)
            # bbox, bbox_num, nms_keep_idx = self.aux_o2m_head.post_process(
            #     aux_o2m_outs, self.inputs['scale_factor'])
            output = {'bbox': bbox, 'bbox_num': bbox_num}
            if self.with_mask:
                output['mask'] = mask
            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

================================================ FILE: ppdet/modeling/architectures/s2anet.py ================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['S2ANet'] @register class S2ANet(BaseArch): __category__ = 'architecture' __inject__ = ['head'] def __init__(self, backbone, neck, head): """ S2ANet, see https://arxiv.org/pdf/2008.09397.pdf Args: backbone (object): backbone instance neck (object): `FPN` instance head (object): `Head` instance """ super(S2ANet, self).__init__() self.backbone = backbone self.neck = neck self.s2anet_head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} head = create(cfg['head'], **kwargs) return {'backbone': backbone, 'neck': neck, "head": head} def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: loss = self.s2anet_head(body_feats, self.inputs) return loss else: head_outs = self.s2anet_head(body_feats) # post_process bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs) # rescale the prediction back to origin image im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape, scale_factor) # output output = {'bbox': bboxes, 'bbox_num': bbox_num} return output def get_loss(self, ): loss = self._forward() return loss def get_pred(self): output = self._forward() return output ================================================ FILE: ppdet/modeling/architectures/solov2.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
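# --- Illustrative sketch (not from the repository) -------------------------
# Back in RTDETRV3._forward, the auxiliary one-to-many head's losses are
# folded into the DETR loss dict: the aux total is added into 'loss', and
# every aux entry is also kept under an '_aux_o2m' suffix for logging. The
# same bookkeeping with plain dicts and dummy values:

detr_losses = {'loss_class': 1.0, 'loss_bbox': 2.0, 'loss': 3.0}
aux_o2m_losses = {'loss_cls': 0.5, 'loss_iou': 0.2, 'loss': 0.7}

for k, v in aux_o2m_losses.items():
    if k == 'loss':
        detr_losses[k] += v          # fold the aux total into the main loss
    detr_losses[k + '_aux_o2m'] = v  # keep the aux entry for logging

assert abs(detr_losses['loss'] - 3.7) < 1e-9
assert detr_losses['loss_iou_aux_o2m'] == 0.2
# --- end of sketch ----------------------------------------------------------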
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['SOLOv2'] @register class SOLOv2(BaseArch): """ SOLOv2 network, see https://arxiv.org/abs/2003.10152 Args: backbone (object): an backbone instance solov2_head (object): an `SOLOv2Head` instance mask_head (object): an `SOLOv2MaskHead` instance neck (object): neck of network, such as feature pyramid network instance """ __category__ = 'architecture' def __init__(self, backbone, solov2_head, mask_head, neck=None): super(SOLOv2, self).__init__() self.backbone = backbone self.neck = neck self.solov2_head = solov2_head self.mask_head = mask_head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} solov2_head = create(cfg['solov2_head'], **kwargs) mask_head = create(cfg['mask_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'solov2_head': solov2_head, 'mask_head': mask_head, } def model_arch(self): body_feats = self.backbone(self.inputs) body_feats = self.neck(body_feats) self.seg_pred = self.mask_head(body_feats) self.cate_pred_list, self.kernel_pred_list = self.solov2_head( body_feats) def get_loss(self, ): loss = {} # get gt_ins_labels, gt_cate_labels, etc. gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], [] fg_num = self.inputs['fg_num'] for i in range(len(self.solov2_head.seg_num_grids)): ins_label = 'ins_label{}'.format(i) if ins_label in self.inputs: gt_ins_labels.append(self.inputs[ins_label]) cate_label = 'cate_label{}'.format(i) if cate_label in self.inputs: gt_cate_labels.append(self.inputs[cate_label]) grid_order = 'grid_order{}'.format(i) if grid_order in self.inputs: gt_grid_orders.append(self.inputs[grid_order]) loss_solov2 = self.solov2_head.get_loss( self.cate_pred_list, self.kernel_pred_list, self.seg_pred, gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num) loss.update(loss_solov2) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): seg_masks, cate_labels, cate_scores, bbox_num = self.solov2_head.get_prediction( self.cate_pred_list, self.kernel_pred_list, self.seg_pred, self.inputs['im_shape'], self.inputs['scale_factor']) outs = { "segm": seg_masks, "bbox_num": bbox_num, 'cate_label': cate_labels, 'cate_score': cate_scores } return outs ================================================ FILE: ppdet/modeling/architectures/sparse_rcnn.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
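# --- Illustrative sketch (not from the repository) -------------------------
# SOLOv2.get_loss above looks up per-level targets by formatted key
# ('ins_label0', 'cate_label0', ...), skipping levels that are absent. A
# stand-alone rendering of that loop over a fake inputs dict:

seg_num_grids = [40, 36, 24]   # one entry per FPN level; values are dummies
inputs = {'ins_label0': 'I0', 'cate_label0': 'C0', 'grid_order0': 'G0',
          'ins_label1': 'I1', 'cate_label1': 'C1', 'grid_order1': 'G1'}

gt_ins, gt_cate, gt_grid = [], [], []
for i in range(len(seg_num_grids)):
    for name, dst in (('ins_label', gt_ins), ('cate_label', gt_cate),
                      ('grid_order', gt_grid)):
        key = '{}{}'.format(name, i)
        if key in inputs:          # a level may carry no targets
            dst.append(inputs[key])

assert gt_ins == ['I0', 'I1']      # level 2 had no targets in this dummy dict
# --- end of sketch ----------------------------------------------------------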
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ["SparseRCNN"] @register class SparseRCNN(BaseArch): __category__ = 'architecture' __inject__ = ["postprocess"] def __init__(self, backbone, neck, head="SparsercnnHead", postprocess="SparsePostProcess"): super(SparseRCNN, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.postprocess = postprocess @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'roi_input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats, self.inputs["img_whwh"]) if not self.training: bbox_pred, bbox_num = self.postprocess( head_outs["pred_logits"], head_outs["pred_boxes"], self.inputs["scale_factor_whwh"], self.inputs["ori_shape"]) return bbox_pred, bbox_num else: return head_outs def get_loss(self): batch_gt_class = self.inputs["gt_class"] batch_gt_box = self.inputs["gt_bbox"] batch_whwh = self.inputs["img_whwh"] targets = [] for i in range(len(batch_gt_class)): boxes = batch_gt_box[i] labels = batch_gt_class[i].squeeze(-1) img_whwh = batch_whwh[i] img_whwh_tgt = img_whwh.unsqueeze(0).tile([int(boxes.shape[0]), 1]) targets.append({ "boxes": boxes, "labels": labels, "img_whwh": img_whwh, "img_whwh_tgt": img_whwh_tgt }) outputs = self._forward() loss_dict = self.head.get_loss(outputs, targets) acc = loss_dict["acc"] loss_dict.pop("acc") total_loss = sum(loss_dict.values()) loss_dict.update({"loss": total_loss, "acc": acc}) return loss_dict def get_pred(self): bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output ================================================ FILE: ppdet/modeling/architectures/ssd.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
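# --- Illustrative sketch (not from the repository) -------------------------
# SparseRCNN.get_loss above sums every term except the bookkeeping 'acc'
# entry, then restores 'acc' for logging. With dummy numbers:

loss_dict = {'loss_ce': 1.0, 'loss_bbox': 2.0, 'loss_giou': 0.5, 'acc': 0.9}

acc = loss_dict.pop('acc')             # a metric, not a loss term
total_loss = sum(loss_dict.values())
loss_dict.update({'loss': total_loss, 'acc': acc})

assert loss_dict['loss'] == 3.5 and loss_dict['acc'] == 0.9
# --- end of sketch ----------------------------------------------------------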
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import paddle import paddle.nn.functional as F __all__ = ['SSD'] @register class SSD(BaseArch): """ Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325 Args: backbone (nn.Layer): backbone instance ssd_head (nn.Layer): `SSDHead` instance post_process (object): `BBoxPostProcess` instance """ __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone, ssd_head, post_process, r34_backbone=False): super(SSD, self).__init__() self.backbone = backbone self.ssd_head = ssd_head self.post_process = post_process self.r34_backbone = r34_backbone if self.r34_backbone: from ppdet.modeling.backbones.resnet import ResNet assert isinstance(self.backbone, ResNet) and \ self.backbone.depth == 34, \ "If you set r34_backbone=True, please use ResNet-34 as backbone." self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1] self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1] @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # head kwargs = {'input_shape': backbone.out_shape} ssd_head = create(cfg['ssd_head'], **kwargs) return { 'backbone': backbone, "ssd_head": ssd_head, } def _forward(self): # Backbone body_feats = self.backbone(self.inputs) # SSD Head if self.training: return self.ssd_head(body_feats, self.inputs['image'], self.inputs['gt_bbox'], self.inputs['gt_class']) else: preds, anchors = self.ssd_head(body_feats, self.inputs['image']) bbox, bbox_num, nms_keep_idx = self.post_process( preds, anchors, self.inputs['im_shape'], self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ preds_logits = preds[1] # [[1xNumBBoxNumClass]] extra_data['scores'] = F.softmax(paddle.concat( preds_logits, axis=1)).transpose([0, 2, 1]) extra_data['logits'] = paddle.concat( preds_logits, axis=1).transpose([0, 2, 1]) extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox, bbox_num, extra_data else: return bbox, bbox_num def get_loss(self, ): return {"loss": self._forward()} def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, extra_data = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, "extra_data": extra_data } else: bbox_pred, bbox_num = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, } return output ================================================ FILE: ppdet/modeling/architectures/tood.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
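# --- Illustrative sketch (not from the repository) -------------------------
# SSD's `use_extra_data` branch above concatenates the per-level class
# logits, softmaxes over the class axis, and transposes to
# (batch, classes, anchors). A shape-only sketch; the per-level anchor
# counts and the 21 classes are made up:

import paddle
import paddle.nn.functional as F

# two levels of per-anchor class logits: (B, N_i, num_classes)
preds_logits = [paddle.randn([1, 100, 21]), paddle.randn([1, 25, 21])]

logits = paddle.concat(preds_logits, axis=1)        # (1, 125, 21)
scores = F.softmax(logits).transpose([0, 2, 1])     # (1, 21, 125)
assert scores.shape == [1, 21, 125]
# --- end of sketch ----------------------------------------------------------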
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['TOOD'] @register class TOOD(BaseArch): """ TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): 'FPN' instance head (nn.Layer): 'TOODHead' instance """ __category__ = 'architecture' def __init__(self, backbone, neck, head): super(TOOD, self).__init__() self.backbone = backbone self.neck = neck self.head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats) if not self.training: bboxes, bbox_num = self.head.post_process( head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) return bboxes, bbox_num else: loss = self.head.get_loss(head_outs, self.inputs) return loss def get_loss(self): return self._forward() def get_pred(self): bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output ================================================ FILE: ppdet/modeling/architectures/ttfnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
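# --- Illustrative sketch (not from the repository) -------------------------
# TOOD.get_pred above emits the result dict shared by the architectures in
# this directory. By PaddleDetection convention, 'bbox' is an (N, 6) tensor
# of [class_id, score, x1, y1, x2, y2] rows for the whole batch and
# 'bbox_num' gives the per-image counts used to split it. A dummy instance
# of that contract:

import paddle

bbox = paddle.to_tensor([[0., 0.9, 10., 10., 50., 50.],
                         [2., 0.7, 20., 20., 60., 80.],
                         [1., 0.8,  5.,  5., 15., 15.]])
bbox_num = paddle.to_tensor([2, 1])   # image 0 has 2 boxes, image 1 has 1

output = {'bbox': bbox, 'bbox_num': bbox_num}
assert int(output['bbox_num'].sum()) == output['bbox'].shape[0]
# --- end of sketch ----------------------------------------------------------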
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['TTFNet'] @register class TTFNet(BaseArch): """ TTFNet network, see https://arxiv.org/abs/1909.00700 Args: backbone (object): backbone instance neck (object): 'TTFFPN' instance ttf_head (object): 'TTFHead' instance post_process (object): 'BBoxPostProcess' instance """ __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone='DarkNet', neck='TTFFPN', ttf_head='TTFHead', post_process='BBoxPostProcess'): super(TTFNet, self).__init__() self.backbone = backbone self.neck = neck self.ttf_head = ttf_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} ttf_head = create(cfg['ttf_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "ttf_head": ttf_head, } def _forward(self): body_feats = self.backbone(self.inputs) body_feats = self.neck(body_feats) hm, wh = self.ttf_head(body_feats) if self.training: return hm, wh else: bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'], self.inputs['scale_factor']) return bbox, bbox_num def get_loss(self, ): loss = {} heatmap = self.inputs['ttf_heatmap'] box_target = self.inputs['ttf_box_target'] reg_weight = self.inputs['ttf_reg_weight'] hm, wh = self._forward() head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target, reg_weight) loss.update(head_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): bbox_pred, bbox_num = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, } return output ================================================ FILE: ppdet/modeling/architectures/yolo.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
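# --- Illustrative sketch (not from the repository) -------------------------
# TTFNet.get_loss above (and SOLOv2 earlier) totals the loss dict with
# paddle.add_n over its values before inserting the 'loss' key. The idiom in
# isolation, with dummy single-element tensors:

import paddle

loss = {'hm_loss': paddle.to_tensor([1.5]), 'wh_loss': paddle.to_tensor([0.5])}
loss['loss'] = paddle.add_n(list(loss.values()))  # values read before insert
assert float(loss['loss']) == 2.0
# --- end of sketch ----------------------------------------------------------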
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ..post_process import JDEBBoxPostProcess __all__ = ['YOLOv3'] # YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3 # PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py, especially when use distillation or aux head @register class YOLOv3(BaseArch): __category__ = 'architecture' __shared__ = ['data_format'] __inject__ = ['post_process'] def __init__(self, backbone='DarkNet', neck='YOLOv3FPN', yolo_head='YOLOv3Head', post_process='BBoxPostProcess', data_format='NCHW', for_mot=False): """ YOLOv3 network, see https://arxiv.org/abs/1804.02767 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance yolo_head (nn.Layer): anchor_head instance bbox_post_process (object): `BBoxPostProcess` instance data_format (str): data format, NCHW or NHWC for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ super(YOLOv3, self).__init__(data_format=data_format) self.backbone = backbone self.neck = neck self.yolo_head = yolo_head self.post_process = post_process self.for_mot = for_mot self.return_idx = isinstance(post_process, JDEBBoxPostProcess) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} yolo_head = create(cfg['yolo_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "yolo_head": yolo_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.for_mot: neck_feats = self.neck(body_feats, self.for_mot) else: neck_feats = self.neck(body_feats) if isinstance(neck_feats, dict): assert self.for_mot == True emb_feats = neck_feats['emb_feats'] neck_feats = neck_feats['yolo_feats'] if self.training: yolo_losses = self.yolo_head(neck_feats, self.inputs) if self.for_mot: return {'det_losses': yolo_losses, 'emb_feats': emb_feats} else: return yolo_losses else: yolo_head_outs = self.yolo_head(neck_feats) if self.for_mot: # the detection part of JDE MOT model boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors) output = { 'bbox': bbox, 'bbox_num': bbox_num, 'boxes_idx': boxes_idx, 'nms_keep_idx': nms_keep_idx, 'emb_feats': emb_feats, } else: if self.return_idx: # the detection part of JDE MOT model _, bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors) elif self.post_process is not None: # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) else: # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( yolo_head_outs, self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) # Todo: get logits output extra_data['nms_keep_idx'] = nms_keep_idx # Todo support for mask_anchors yolo output = {'bbox': bbox, 'bbox_num': bbox_num, 
'extra_data': extra_data} else: output = {'bbox': bbox, 'bbox_num': bbox_num} return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/yolof.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['YOLOF'] @register class YOLOF(BaseArch): __category__ = 'architecture' def __init__(self, backbone='ResNet', neck='DilatedEncoder', head='YOLOFHead', for_mot=False): """ YOLOF network, see https://arxiv.org/abs/2103.09460 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): DilatedEncoder instance head (nn.Layer): YOLOFHead instance for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ super(YOLOF, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.for_mot = for_mot @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) if self.training: yolo_losses = self.head(neck_feats, self.inputs) return yolo_losses else: yolo_head_outs = self.head(neck_feats) bbox, bbox_num = self.head.post_process(yolo_head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) output = {'bbox': bbox, 'bbox_num': bbox_num} return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/yolox.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
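# --- Illustrative sketch (not from the repository) -------------------------
# When YOLOv3 above serves as the detection half of a JDE tracker, the neck
# returns a dict carrying both detection and embedding features; the
# unpacking amounts to the following (placeholder strings stand in for
# feature tensors):

neck_out = {'yolo_feats': ['P3', 'P4', 'P5'], 'emb_feats': ['E3', 'E4', 'E5']}

if isinstance(neck_out, dict):        # only happens when for_mot is True
    emb_feats = neck_out['emb_feats']
    neck_feats = neck_out['yolo_feats']
else:
    neck_feats = neck_out

assert emb_feats[0] == 'E3' and neck_feats[0] == 'P3'
# --- end of sketch ----------------------------------------------------------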
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import random import paddle import paddle.nn.functional as F import paddle.distributed as dist __all__ = ['YOLOX'] @register class YOLOX(BaseArch): """ YOLOX network, see https://arxiv.org/abs/2107.08430 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance head (nn.Layer): head instance for_mot (bool): whether used for MOT or not input_size (list[int]): initial scale, will be reset by self._preprocess() size_stride (int): stride of the size range size_range (list[int]): multi-scale range for training random_interval (int): interval of iter to change self._input_size """ __category__ = 'architecture' def __init__(self, backbone='CSPDarkNet', neck='YOLOCSPPAN', head='YOLOXHead', for_mot=False, input_size=[640, 640], size_stride=32, size_range=[15, 25], random_interval=10): super(YOLOX, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.for_mot = for_mot self.input_size = input_size self._input_size = paddle.to_tensor(input_size) self.size_stride = size_stride self.size_range = size_range self.random_interval = random_interval self._step = 0 @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): if self.training: self._preprocess() body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) if self.training: yolox_losses = self.head(neck_feats, self.inputs) yolox_losses.update({'size': self._input_size[0]}) return yolox_losses else: head_outs = self.head(neck_feats) bbox, bbox_num = self.head.post_process( head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) return {'bbox': bbox, 'bbox_num': bbox_num} def get_loss(self): return self._forward() def get_pred(self): return self._forward() def _preprocess(self): # YOLOX multi-scale training, interpolate resize before inputs of the network. self._get_size() scale_y = self._input_size[0] / self.input_size[0] scale_x = self._input_size[1] / self.input_size[1] if scale_x != 1 or scale_y != 1: self.inputs['image'] = F.interpolate( self.inputs['image'], size=self._input_size, mode='bilinear', align_corners=False) gt_bboxes = self.inputs['gt_bbox'] for i in range(len(gt_bboxes)): if len(gt_bboxes[i]) > 0: gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y self.inputs['gt_bbox'] = gt_bboxes def _get_size(self): # random_interval = 10 as default, every 10 iters to change self._input_size image_ratio = self.input_size[1] * 1.0 / self.input_size[0] if self._step % self.random_interval == 0: size_factor = random.randint(*self.size_range) size = [ self.size_stride * size_factor, self.size_stride * int(size_factor * image_ratio) ] self._input_size = paddle.to_tensor(size) self._step += 1 ================================================ FILE: ppdet/modeling/assigners/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import utils from . import task_aligned_assigner from . import atss_assigner from . import simota_assigner from . import max_iou_assigner from . import fcosr_assigner from . import rotated_task_aligned_assigner from . import task_aligned_assigner_cr from . import uniform_assigner from .utils import * from .task_aligned_assigner import * from .atss_assigner import * from .simota_assigner import * from .max_iou_assigner import * from .fcosr_assigner import * from .rotated_task_aligned_assigner import * from .task_aligned_assigner_cr import * from .uniform_assigner import * from .hungarian_assigner import * from .pose_utils import * ================================================ FILE: ppdet/modeling/assigners/atss_assigner.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
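# --- Illustrative sketch (not from the repository) -------------------------
# YOLOX._get_size above redraws the training resolution every
# `random_interval` steps as a random multiple of `size_stride` within
# `size_range`, scaling the second dimension by the original aspect ratio.
# The arithmetic in isolation, using the defaults from the class:

import random

input_size, size_stride, size_range = [640, 640], 32, [15, 25]
image_ratio = input_size[1] * 1.0 / input_size[0]

size_factor = random.randint(*size_range)           # e.g. 15..25
size = [size_stride * size_factor,
        size_stride * int(size_factor * image_ratio)]

assert size[0] % size_stride == 0 and size[1] % size_stride == 0
assert 480 <= size[0] <= 800                        # 32*15 .. 32*25
# --- end of sketch ----------------------------------------------------------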
from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import iou_similarity, batch_iou_similarity from ..bbox_utils import bbox_center from .utils import (check_points_inside_bboxes, compute_max_iou_anchor, compute_max_iou_gt) __all__ = ['ATSSAssigner'] @register class ATSSAssigner(nn.Layer): """Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection """ __shared__ = ['num_classes'] def __init__(self, topk=9, num_classes=80, force_gt_matching=False, eps=1e-9, sm_use=False): super(ATSSAssigner, self).__init__() self.topk = topk self.num_classes = num_classes self.force_gt_matching = force_gt_matching self.eps = eps self.sm_use = sm_use def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, pad_gt_mask): gt2anchor_distances_list = paddle.split( gt2anchor_distances, num_anchors_list, axis=-1) num_anchors_index = np.cumsum(num_anchors_list).tolist() num_anchors_index = [0, ] + num_anchors_index[:-1] is_in_topk_list = [] topk_idxs_list = [] for distances, anchors_index in zip(gt2anchor_distances_list, num_anchors_index): num_anchors = distances.shape[-1] _, topk_idxs = paddle.topk( distances, self.topk, axis=-1, largest=False) topk_idxs_list.append(topk_idxs + anchors_index) is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( axis=-2).astype(gt2anchor_distances.dtype) is_in_topk_list.append(is_in_topk * pad_gt_mask) is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1) topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1) return is_in_topk_list, topk_idxs_list @paddle.no_grad() def forward(self, anchor_bboxes, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None, pred_bboxes=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py The assignment is done in following steps 1. compute iou between all bbox (bbox of all pyramid levels) and gt 2. compute center distance between all bbox and gt 3. on each pyramid level, for each gt, select k bbox whose center are closest to the gt center, so we total select k*l bbox as candidates for each gt 4. get corresponding iou for the these candidates, and compute the mean and std, set mean + std as the iou threshold 5. select these candidates whose iou are greater than or equal to the threshold as positive 6. limit the positive sample's center in gt 7. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. 
Args: anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4), "xmin, xmax, ymin, ymax" format num_anchors_list (List): num of anchors in each level gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1), if None, then it will initialize with one_hot label pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious """ assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 num_anchors, _ = anchor_bboxes.shape batch_size, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, self.num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # 1. compute iou between gt and anchor bbox, [B, n, L] ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes) ious = ious.reshape([batch_size, -1, num_anchors]) # 2. compute center distance between all anchors and gt, [B, n, L] gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1) anchor_centers = bbox_center(anchor_bboxes) gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \ .norm(2, axis=-1).reshape([batch_size, -1, num_anchors]) # 3. on each pyramid level, selecting topk closest candidates # based on the center distance, [B, n, L] is_in_topk, topk_idxs = self._gather_topk_pyramid( gt2anchor_distances, num_anchors_list, pad_gt_mask) # 4. get corresponding iou for the these candidates, and compute the # mean and std, 5. set mean + std as the iou threshold iou_candidates = ious * is_in_topk iou_threshold = paddle.index_sample( iou_candidates.flatten(stop_axis=-2), topk_idxs.flatten(stop_axis=-2)) iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1]) iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \ iou_threshold.std(axis=-1, keepdim=True) is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk, paddle.zeros_like(is_in_topk)) # 6. check the positive sample's center in gt, [B, n, L] if self.sm_use: is_in_gts = check_points_inside_bboxes( anchor_centers, gt_bboxes, sm_use=True) else: is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts * pad_gt_mask # 7. if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected. mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = ( mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile( [1, num_max_boxes, 1]).astype('bool') if self.sm_use: is_max_iou = compute_max_iou_anchor(ious * mask_positive) else: is_max_iou = compute_max_iou_anchor(ious) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) # 8. 
make sure every gt_bbox matches the anchor if self.force_gt_matching: is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile( [1, num_max_boxes, 1]) mask_positive = paddle.where(mask_max_iou, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) ind = list(range(self.num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) if pred_bboxes is not None: # assigned iou ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive ious = ious.max(axis=-2).unsqueeze(-1) assigned_scores *= ious elif gt_scores is not None: gather_scores = paddle.gather( gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) gather_scores = gather_scores.reshape([batch_size, num_anchors]) gather_scores = paddle.where(mask_positive_sum > 0, gather_scores, paddle.zeros_like(gather_scores)) assigned_scores *= gather_scores.unsqueeze(-1) return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/clrnet_assigner.py ================================================ import paddle import paddle.nn.functional as F from ppdet.modeling.losses.clrnet_line_iou_loss import line_iou def distance_cost(predictions, targets, img_w): """ repeat predictions and targets to generate all combinations use the abs distance as the new distance cost """ num_priors = predictions.shape[0] num_targets = targets.shape[0] predictions = paddle.repeat_interleave( predictions, num_targets, axis=0)[..., 6:] targets = paddle.concat(x=num_priors * [targets])[..., 6:] invalid_masks = (targets < 0) | (targets >= img_w) lengths = (~invalid_masks).sum(axis=1) distances = paddle.abs(x=targets - predictions) distances[invalid_masks] = 0.0 distances = distances.sum(axis=1) / (lengths.cast("float32") + 1e-09) distances = distances.reshape([num_priors, num_targets]) return distances def focal_cost(cls_pred, gt_labels, alpha=0.25, gamma=2, eps=1e-12): """ Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: torch.Tensor: cls_cost value """ cls_pred = F.sigmoid(cls_pred) neg_cost = -(1 - cls_pred + eps).log() * (1 - alpha) * cls_pred.pow(gamma) pos_cost = -(cls_pred + eps).log() * alpha * (1 - cls_pred).pow(gamma) cls_cost = pos_cost.index_select( gt_labels, axis=1) - neg_cost.index_select( gt_labels, axis=1) return cls_cost def dynamic_k_assign(cost, pair_wise_ious): """ Assign grouth truths with priors dynamically. Args: cost: the assign cost. pair_wise_ious: iou of grouth truth and priors. Returns: prior_idx: the index of assigned prior. gt_idx: the corresponding ground truth index. 
""" matching_matrix = paddle.zeros_like(cost) ious_matrix = pair_wise_ious ious_matrix[ious_matrix < 0] = 0.0 n_candidate_k = 4 topk_ious, _ = paddle.topk(ious_matrix, n_candidate_k, axis=0) dynamic_ks = paddle.clip(x=topk_ious.sum(0).cast("int32"), min=1) num_gt = cost.shape[1] for gt_idx in range(num_gt): _, pos_idx = paddle.topk( x=cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) matching_matrix[pos_idx, gt_idx] = 1.0 del topk_ious, dynamic_ks, pos_idx matched_gt = matching_matrix.sum(axis=1) if (matched_gt > 1).sum() > 0: matched_gt_indices = paddle.nonzero(matched_gt > 1)[:, 0] cost_argmin = paddle.argmin( cost.index_select(matched_gt_indices), axis=1) matching_matrix[matched_gt_indices][0] *= 0.0 matching_matrix[matched_gt_indices, cost_argmin] = 1.0 prior_idx = matching_matrix.sum(axis=1).nonzero() gt_idx = matching_matrix[prior_idx].argmax(axis=-1) return prior_idx.flatten(), gt_idx.flatten() def cdist_paddle(x1, x2, p=2): assert x1.shape[1] == x2.shape[1] B, M = x1.shape # if p == np.inf: # dist = np.max(np.abs(x1[:, np.newaxis, :] - x2[np.newaxis, :, :]), axis=-1) if p == 1: dist = paddle.sum( paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), axis=-1) else: dist = paddle.pow(paddle.sum(paddle.pow( paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), p), axis=-1), 1 / p) return dist def assign(predictions, targets, img_w, img_h, distance_cost_weight=3.0, cls_cost_weight=1.0): """ computes dynamicly matching based on the cost, including cls cost and lane similarity cost Args: predictions (Tensor): predictions predicted by each stage, shape: (num_priors, 78) targets (Tensor): lane targets, shape: (num_targets, 78) return: matched_row_inds (Tensor): matched predictions, shape: (num_targets) matched_col_inds (Tensor): matched targets, shape: (num_targets) """ predictions = predictions.detach().clone() predictions[:, 3] *= img_w - 1 predictions[:, 6:] *= img_w - 1 targets = targets.detach().clone() distances_score = distance_cost(predictions, targets, img_w) distances_score = 1 - distances_score / paddle.max(x=distances_score) + 0.01 cls_score = focal_cost(predictions[:, :2], targets[:, 1].cast('int64')) num_priors = predictions.shape[0] num_targets = targets.shape[0] target_start_xys = targets[:, 2:4] target_start_xys[..., 0] *= (img_h - 1) prediction_start_xys = predictions[:, 2:4] prediction_start_xys[..., 0] *= (img_h - 1) start_xys_score = cdist_paddle( prediction_start_xys, target_start_xys, p=2).reshape([num_priors, num_targets]) start_xys_score = 1 - start_xys_score / paddle.max(x=start_xys_score) + 0.01 target_thetas = targets[:, 4].unsqueeze(axis=-1) theta_score = cdist_paddle( predictions[:, 4].unsqueeze(axis=-1), target_thetas, p=1).reshape([num_priors, num_targets]) * 180 theta_score = 1 - theta_score / paddle.max(x=theta_score) + 0.01 cost = -(distances_score * start_xys_score * theta_score )**2 * distance_cost_weight + cls_score * cls_cost_weight iou = line_iou(predictions[..., 6:], targets[..., 6:], img_w, aligned=False) matched_row_inds, matched_col_inds = dynamic_k_assign(cost, iou) return matched_row_inds, matched_col_inds ================================================ FILE: ppdet/modeling/assigners/fcosr_assigner.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather __all__ = ['FCOSRAssigner'] EPS = 1e-9 @register class FCOSRAssigner(nn.Layer): """ FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details 1. compute normalized gaussian distribution score and refined gaussian distribution score 2. refer to ellipse center sampling, sample points whose normalized gaussian distribution score is greater than threshold 3. refer to multi-level sampling, assign ground truth to feature map which follows two conditions. i). first, the ratio between the short edge of the target and the stride of the feature map is less than 2. ii). second, the long edge of minimum bounding rectangle of the target is larger than the acceptance range of feature map 4. refer to fuzzy sample label assignment, the points satisfying 2 and 3 will be assigned to the ground truth according to gaussian distribution score """ __shared__ = ['num_classes'] def __init__(self, num_classes=80, factor=12, threshold=0.23, boundary=[[-1, 128], [128, 320], [320, 10000]], score_type='iou'): super(FCOSRAssigner, self).__init__() self.num_classes = num_classes self.factor = factor self.threshold = threshold self.boundary = [ paddle.to_tensor( l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary ] self.score_type = score_type def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys): # projecting points to coordinate system defined by each rbox # [B, N, 4, 2] -> 4 * [B, N, 1, 2] a, b, c, d = gt_polys.split(4, axis=2) # [1, L, 2] -> [1, 1, L, 2] points = points.unsqueeze(0) ab = b - a ad = d - a # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1] xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1) # [B, N, 2] -> [B, N, 1, 2] xy = xy.unsqueeze(2) # vector of points to center [B, N, L, 2] vec = points - xy # = |ab| * |vec| * cos(theta) [B, N, L] vec_dot_ab = paddle.sum(vec * ab, axis=-1) # = |ad| * |vec| * cos(theta) [B, N, L] vec_dot_ad = paddle.sum(vec * ad, axis=-1) # norm_ab [B, N, L] norm_ab = paddle.sum(ab * ab, axis=-1).sqrt() # norm_ad [B, N, L] norm_ad = paddle.sum(ad * ad, axis=-1).sqrt() # min(h, w), [B, N, 1] min_edge = paddle.min(wh, axis=-1, keepdim=True) # delta_x, delta_y [B, N, L] delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS) delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS) # score [B, N, L] norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y)) # simplified calculation sigma = min_edge / self.factor refined_score = norm_score / (2 * np.pi * sigma + EPS) return norm_score, refined_score def get_rotated_inside_mask(self, points, gt_polys, scores): inside_mask = check_points_in_polys(points, gt_polys) center_mask = scores >= self.threshold return (inside_mask & center_mask).cast(paddle.float32) def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor, regress_range): # [1, L, 2] 
-> [1, 1, L, 2] points = points.unsqueeze(0) # [B, n, 4] -> [B, n, 1, 4] x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1) # [B, n, L, 2] lt = points - x1y1 rb = x2y2 - points # [B, n, L, 4] ltrb = paddle.concat([lt, rb], axis=-1) # [B, n, L, 4] -> [B, n, L] inside_mask = paddle.min(ltrb, axis=-1) > EPS # regress_range [1, L, 2] -> [1, 1, L, 2] regress_range = regress_range.unsqueeze(0) # stride_tensor [1, L, 1] -> [1, 1, L] stride_tensor = stride_tensor.transpose((0, 2, 1)) # fcos range # [B, n, L, 4] -> [B, n, L] ltrb_max = paddle.max(ltrb, axis=-1) # [1, 1, L, 2] -> [1, 1, L] low, high = regress_range[..., 0], regress_range[..., 1] # [B, n, L] regress_mask = (ltrb_max >= low) & (ltrb_max <= high) # mask for rotated # [B, n, 1] min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True) # [B, n , L] rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high) mask = inside_mask & (regress_mask | rotated_mask) return mask.cast(paddle.float32) @paddle.no_grad() def forward(self, anchor_points, stride_tensor, num_anchors_list, gt_labels, gt_bboxes, gt_rboxes, pad_gt_mask, bg_index, pred_rboxes=None): r""" Args: anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 2), "x, y" format stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1) num_anchors_list (List): num of anchors in each level gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5) Returns: assigned_labels (Tensor): (B, L) assigned_rboxes (Tensor): (B, L, 5) assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious """ _, num_anchors, _ = anchor_points.shape batch_size, num_max_boxes, _ = gt_rboxes.shape if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5]) assigned_scores = paddle.zeros( [batch_size, num_anchors, self.num_classes]) return assigned_labels, assigned_rboxes, assigned_scores # get normalized gaussian distribution score and refined distribution score gt_polys = box2corners(gt_rboxes) score, refined_score = self.get_gaussian_distribution_score( anchor_points, gt_rboxes, gt_polys) inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys, score) regress_ranges = [] for num, bound in zip(num_anchors_list, self.boundary): regress_ranges.append(bound.tile((1, num, 1))) regress_ranges = paddle.concat(regress_ranges, axis=1) regress_mask = self.get_inside_range_mask( anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges) # [B, n, L] mask_positive = inside_mask * regress_mask * pad_gt_mask refined_score = refined_score * mask_positive - (1. 
- mask_positive) argmax_refined_score = refined_score.argmax(axis=-2) max_refined_score = refined_score.max(axis=-2) assigned_gt_index = argmax_refined_score # assigned target batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( max_refined_score > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_rboxes = paddle.gather( gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5]) assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) ind = list(range(self.num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) if self.score_type == 'gaussian': selected_scores = paddle_gather( score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2) assigned_scores = assigned_scores * selected_scores.unsqueeze(-1) elif self.score_type == 'iou': assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None' from ext_op import matched_rbox_iou b, l = pred_rboxes.shape[:2] iou_score = matched_rbox_iou( pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape( (-1, 5))).reshape((b, l, 1)) assigned_scores = assigned_scores * iou_score return assigned_labels, assigned_rboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/hungarian_assigner.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function try: from scipy.optimize import linear_sum_assignment except ImportError: linear_sum_assignment = None import paddle from ppdet.core.workspace import register __all__ = ['PoseHungarianAssigner', 'PseudoSampler'] class AssignResult: """Stores assignments between predicted and truth boxes. Attributes: num_gts (int): the number of truth boxes considered when computing this assignment gt_inds (LongTensor): for each predicted box indicates the 1-based index of the assigned truth box. 0 means unassigned and -1 means ignore. max_overlaps (FloatTensor): the iou between the predicted box and its assigned truth box. labels (None | LongTensor): If specified, for each predicted box indicates the category label of the assigned truth box. 
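    Example (an illustrative sketch; every tensor below is hypothetical)::

        import paddle
        # 2 gts, 4 predictions: gt_inds uses 0 for background, -1 for
        # ignore, and positive values as 1-based gt indices.
        gt_inds = paddle.to_tensor([0, 2, -1, 1])
        labels = paddle.to_tensor([-1, 7, -1, 3])
        result = AssignResult(2, gt_inds, max_overlaps=None, labels=labels)
        assert result.num_preds == 4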
""" def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): self.num_gts = num_gts self.gt_inds = gt_inds self.max_overlaps = max_overlaps self.labels = labels # Interface for possible user-defined properties self._extra_properties = {} @property def num_preds(self): """int: the number of predictions in this assignment""" return len(self.gt_inds) def set_extra_property(self, key, value): """Set user-defined new property.""" assert key not in self.info self._extra_properties[key] = value def get_extra_property(self, key): """Get user-defined property.""" return self._extra_properties.get(key, None) @property def info(self): """dict: a dictionary of info about the object""" basic_info = { 'num_gts': self.num_gts, 'num_preds': self.num_preds, 'gt_inds': self.gt_inds, 'max_overlaps': self.max_overlaps, 'labels': self.labels, } basic_info.update(self._extra_properties) return basic_info @register class PoseHungarianAssigner: """Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The costs are weighted sum of three components: classification cost, regression L1 cost and regression oks cost. The targets don't include the no_object, so generally there are more predictions than targets. After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt. - positive integer: positive sample, index (1-based) of assigned gt. Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. kpt_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. oks_weight (int | float, optional): The scale factor for regression oks cost. Default 1.0. """ __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost'] def __init__(self, cls_cost='ClassificationCost', kpt_cost='KptL1Cost', oks_cost='OksCost'): self.cls_cost = cls_cost self.kpt_cost = kpt_cost self.oks_cost = oks_cost def assign(self, cls_pred, kpt_pred, gt_labels, gt_keypoints, gt_areas, img_meta, eps=1e-7): """Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. kpt_pred (Tensor): Predicted keypoints with normalized coordinates (x_{i}, y_{i}), which are all in range [0, 1]. Shape [num_query, K*2]. gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,). gt_keypoints (Tensor): Ground truth keypoints with unnormalized coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3]. gt_areas (Tensor): Ground truth mask areas, shape (num_gt,). img_meta (dict): Meta information for current image. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. 
""" num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0] if not gt_keypoints.astype('bool').any(): num_gts = 0 # 1. assign -1 by default assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64") assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64") if num_gts == 0 or num_kpts == 0: # No ground truth or keypoints, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) img_h, img_w, _ = img_meta['img_shape'] factor = paddle.to_tensor( [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape( (1, -1)) # 2. compute the weighted costs # classification cost cls_cost = self.cls_cost(cls_pred, gt_labels) # keypoint regression L1 cost gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1, 3)) valid_kpt_flag = gt_keypoints_reshape[..., -1] kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, 2)) normalize_gt_keypoints = gt_keypoints_reshape[ ..., :2] / factor[:, :2].unsqueeze(0) kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints, valid_kpt_flag) # keypoint OKS cost kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, 2)) kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0) oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2], valid_kpt_flag, gt_areas) # weighted sum of above three costs cost = cls_cost + kpt_cost + oks_cost # 3. do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') matched_row_inds, matched_col_inds = linear_sum_assignment(cost) matched_row_inds = paddle.to_tensor(matched_row_inds) matched_col_inds = paddle.to_tensor(matched_col_inds) # 4. assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][ ..., 0].astype("int64") return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) class SamplingResult: """Bbox sampling result. 
""" def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags): self.pos_inds = pos_inds self.neg_inds = neg_inds if pos_inds.size > 0: self.pos_bboxes = bboxes[pos_inds] self.neg_bboxes = bboxes[neg_inds] self.pos_is_gt = gt_flags[pos_inds] self.num_gts = gt_bboxes.shape[0] self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 if gt_bboxes.numel() == 0: # hack for index error case assert self.pos_assigned_gt_inds.numel() == 0 self.pos_gt_bboxes = paddle.zeros( gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4)) else: if len(gt_bboxes.shape) < 2: gt_bboxes = gt_bboxes.reshape((-1, 4)) self.pos_gt_bboxes = paddle.index_select( gt_bboxes, self.pos_assigned_gt_inds.astype('int64'), axis=0) if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] else: self.pos_gt_labels = None @property def bboxes(self): """paddle.Tensor: concatenated positive and negative boxes""" return paddle.concat([self.pos_bboxes, self.neg_bboxes]) def __nice__(self): data = self.info.copy() data['pos_bboxes'] = data.pop('pos_bboxes').shape data['neg_bboxes'] = data.pop('neg_bboxes').shape parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] body = ' ' + ',\n '.join(parts) return '{\n' + body + '\n}' @property def info(self): """Returns a dictionary of info about the object.""" return { 'pos_inds': self.pos_inds, 'neg_inds': self.neg_inds, 'pos_bboxes': self.pos_bboxes, 'neg_bboxes': self.neg_bboxes, 'pos_is_gt': self.pos_is_gt, 'num_gts': self.num_gts, 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, } @register class PseudoSampler: """A pseudo sampler that does not do sampling actually.""" def __init__(self, **kwargs): pass def _sample_pos(self, **kwargs): """Sample positive samples.""" raise NotImplementedError def _sample_neg(self, **kwargs): """Sample negative samples.""" raise NotImplementedError def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs): """Directly returns the positive and negative indices of samples. Args: assign_result (:obj:`AssignResult`): Assigned results bboxes (paddle.Tensor): Bounding boxes gt_bboxes (paddle.Tensor): Ground truth boxes Returns: :obj:`SamplingResult`: sampler results """ pos_inds = paddle.nonzero( assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) neg_inds = paddle.nonzero( assign_result.gt_inds == 0, as_tuple=False).squeeze(-1) gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32') sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) return sampling_result ================================================ FILE: ppdet/modeling/assigners/max_iou_assigner.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ppdet.core.workspace import register
from ppdet.modeling.proposal_generator.target import label_box

__all__ = ['MaxIoUAssigner']


@register
class MaxIoUAssigner(object):
    """A standard bbox assigner based on max IoU, using ppdet's label_box
    as backend.

    Args:
        positive_overlap (float): threshold for defining positive samples
        negative_overlap (float): threshold for defining negative samples
        allow_low_quality (bool): whether to lower the IoU threshold if a GT
            poorly overlaps with candidate bboxes
    """

    def __init__(self, positive_overlap, negative_overlap,
                 allow_low_quality=True):
        self.positive_overlap = positive_overlap
        self.negative_overlap = negative_overlap
        self.allow_low_quality = allow_low_quality

    def __call__(self, bboxes, gt_bboxes):
        matches, match_labels = label_box(
            bboxes,
            gt_bboxes,
            positive_overlap=self.positive_overlap,
            negative_overlap=self.negative_overlap,
            allow_low_quality=self.allow_low_quality,
            ignore_thresh=-1,
            is_crowd=None,
            assign_on_cpu=False)
        return matches, match_labels


================================================
FILE: ppdet/modeling/assigners/pose_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle
import paddle.nn.functional as F

from ppdet.core.workspace import register

__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']


def masked_fill(x, mask, value):
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, y, x)


@register
class KptL1Cost(object):
    """KptL1Cost.

    This function is based on:
    https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py

    Args:
        weight (int | float, optional): loss_weight.
    """

    def __init__(self, weight=1.0):
        self.weight = weight

    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
        """
        Args:
            kpt_pred (Tensor): Predicted keypoints with normalized
                coordinates (x_{i}, y_{i}), which are all in range [0, 1].
                Shape [num_query, K, 2].
            gt_keypoints (Tensor): Ground truth keypoints with normalized
                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
                Shape [num_gt, K].

        Returns:
            paddle.Tensor: kpt_cost value with weight.
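
        Example (shape sketch only; all values hypothetical)::

            import paddle
            kpt_pred = paddle.rand([3, 17, 2])      # 3 queries, K = 17
            gt_keypoints = paddle.rand([2, 17, 2])  # 2 gts
            valid = paddle.ones([2, 17])
            cost = KptL1Cost(weight=1.0)(kpt_pred, gt_keypoints, valid)
            # cost: [3, 2] -- per (query, gt) L1 distance, averaged over
            # 2 * num_valid_keypoints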
""" kpt_cost = [] for i in range(len(gt_keypoints)): if gt_keypoints[i].size == 0: kpt_cost.append(kpt_pred.sum() * 0) kpt_pred_tmp = kpt_pred.clone() valid_flag = valid_kpt_flag[i] > 0 valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as( kpt_pred_tmp) if not valid_flag_expand.all(): kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0) cost = F.pairwise_distance( kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)), gt_keypoints[i].reshape((-1, )).unsqueeze(0), p=1, keepdim=True) avg_factor = paddle.clip( valid_flag.astype('float32').sum() * 2, 1.0) cost = cost / avg_factor kpt_cost.append(cost) kpt_cost = paddle.concat(kpt_cost, axis=1) return kpt_cost * self.weight @register class OksCost(object): """OksCost. this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py Args: num_keypoints (int): number of keypoints weight (int | float, optional): loss_weight. """ def __init__(self, num_keypoints=17, weight=1.0): self.weight = weight if num_keypoints == 17: self.sigmas = np.array( [ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ], dtype=np.float32) / 10.0 elif num_keypoints == 14: self.sigmas = np.array( [ .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, .79 ], dtype=np.float32) / 10.0 else: raise ValueError(f'Unsupported keypoints number {num_keypoints}') def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas): """ Args: kpt_pred (Tensor): Predicted keypoints with unnormalized coordinates (x_{i}, y_{i}). Shape [num_query, K, 2]. gt_keypoints (Tensor): Ground truth keypoints with unnormalized coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2]. valid_kpt_flag (Tensor): valid flag of ground truth keypoints. Shape [num_gt, K]. gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,]. Returns: paddle.Tensor: oks_cost value with weight. """ sigmas = paddle.to_tensor(self.sigmas) variances = (sigmas * 2)**2 oks_cost = [] assert len(gt_keypoints) == len(gt_areas) for i in range(len(gt_keypoints)): if gt_keypoints[i].size == 0: oks_cost.append(kpt_pred.sum() * 0) squared_distance = \ (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \ (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2 vis_flag = (valid_kpt_flag[i] > 0).astype('int') vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0] num_vis_kpt = vis_ind.shape[0] # assert num_vis_kpt > 0 if num_vis_kpt == 0: oks_cost.append(paddle.zeros((squared_distance.shape[0], 1))) continue area = gt_areas[i] squared_distance0 = squared_distance / (area * variances * 2) squared_distance0 = paddle.index_select( squared_distance0, vis_ind, axis=1) squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1, keepdim=True) oks = squared_distance1 / num_vis_kpt # The 1 is a constant that doesn't change the matching, so omitted. oks_cost.append(-oks) oks_cost = paddle.concat(oks_cost, axis=1) return oks_cost * self.weight @register class ClassificationCost: """ClsSoftmaxCost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape (num_query, num_class). gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: paddle.Tensor: cls_cost value with weight """ # Following the official DETR repo, contrary to the loss that # NLL is used, we approximate it in 1 - cls_score[gt_label]. 
        # The 1 is a constant that doesn't change the matching,
        # so it can be omitted.
        cls_score = cls_pred.softmax(-1)
        cls_cost = -cls_score[:, gt_labels]
        return cls_cost * self.weight


@register
class FocalLossCost:
    """FocalLossCost.

    Args:
        weight (int | float, optional): loss_weight
        alpha (int | float, optional): focal_loss alpha
        gamma (int | float, optional): focal_loss gamma
        eps (float, optional): default 1e-12
        binary_input (bool, optional): Whether the input is binary,
            default False.
    """

    def __init__(self,
                 weight=1.,
                 alpha=0.25,
                 gamma=2,
                 eps=1e-12,
                 binary_input=False):
        self.weight = weight
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps
        self.binary_input = binary_input

    def _focal_loss_cost(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits, shape
                (num_query, num_class).
            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).

        Returns:
            paddle.Tensor: cls_cost value with weight
        """
        if gt_labels.size == 0:
            return cls_pred.sum() * 0
        cls_pred = F.sigmoid(cls_pred)
        neg_cost = -(1 - cls_pred + self.eps).log() * (
            1 - self.alpha) * cls_pred.pow(self.gamma)
        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
            1 - cls_pred).pow(self.gamma)

        cls_cost = paddle.index_select(
            pos_cost, gt_labels, axis=1) - paddle.index_select(
                neg_cost, gt_labels, axis=1)
        return cls_cost * self.weight

    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits in shape
                (num_query, d1, ..., dn), dtype=paddle.float32.
            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
                dtype=paddle.long. Labels should be binary.

        Returns:
            Tensor: Focal cost matrix with weight in shape
                (num_query, num_gt).
        """
        cls_pred = cls_pred.flatten(1)
        gt_labels = gt_labels.flatten(1).float()
        n = cls_pred.shape[1]
        cls_pred = F.sigmoid(cls_pred)
        neg_cost = -(1 - cls_pred + self.eps).log() * (
            1 - self.alpha) * cls_pred.pow(self.gamma)
        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
            1 - cls_pred).pow(self.gamma)

        cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
            paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
        return cls_cost / n * self.weight

    def __call__(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits.
            gt_labels (Tensor): Labels.

        Returns:
            Tensor: Focal cost matrix with weight in shape
                (num_query, num_gt).
        """
        if self.binary_input:
            return self._mask_focal_loss_cost(cls_pred, gt_labels)
        else:
            return self._focal_loss_cost(cls_pred, gt_labels)


================================================
FILE: ppdet/modeling/assigners/rotated_task_aligned_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
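
# Illustrative sketch (hypothetical helper; not used by the assigner below):
# the core of task-aligned assignment is the alignment metric
#     alignment = score**alpha * iou**beta,
# from which the top-k anchors per gt become positive candidates.
def _alignment_topk_sketch(cls_scores, ious, topk=13, alpha=1.0, beta=6.0):
    import numpy as np
    # cls_scores, ious: [num_gt, num_anchors], both in [0, 1]
    metric = cls_scores**alpha * ious**beta
    topk_idx = np.argsort(-metric, axis=1)[:, :topk]  # top-k anchors per gt
    return metric, topk_idx
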
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes from .utils import gather_topk_anchors, compute_max_iou_anchor __all__ = ['RotatedTaskAlignedAssigner'] @register class RotatedTaskAlignedAssigner(nn.Layer): """TOOD: Task-aligned One-stage Object Detection """ def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9): super(RotatedTaskAlignedAssigner, self).__init__() self.topk = topk self.alpha = alpha self.beta = beta self.eps = eps @paddle.no_grad() def forward(self, pred_scores, pred_bboxes, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py The assignment is done in following steps 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt 2. select top-k bbox as candidates for each gt 3. limit the positive sample's center in gt (because the anchor-free detector only can predict positive distance) 4. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. Args: pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5) anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format num_anchors_list (List): num of anchors in each level, shape(L) gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 5) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 batch_size, num_anchors, num_classes = pred_scores.shape _, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] ious = rotated_iou_similarity(gt_bboxes, pred_bboxes) ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious) ious.stop_gradient = True # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) gt_labels_ind = paddle.stack( [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], axis=-1) bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) # compute alignment metrics, [B, n, L] alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( self.beta) # check the positive sample's center in gt, [B, n, L] is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes) # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] is_in_topk = gather_topk_anchors( alignment_metrics * is_in_gts.astype(alignment_metrics.dtype), 
self.topk, topk_mask=pad_gt_mask) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts.astype(is_in_topk.dtype) * pad_gt_mask # if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected, [B, n, L] mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( [1, num_max_boxes, 1]) is_max_iou = compute_max_iou_anchor(ious) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5]) assigned_scores = F.one_hot(assigned_labels, num_classes + 1) ind = list(range(num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) max_ious_per_instance = (ious * mask_positive).max(axis=-1, keepdim=True) alignment_metrics = alignment_metrics / ( max_metrics_per_instance + self.eps) * max_ious_per_instance alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) assigned_scores = assigned_scores * alignment_metrics assigned_bboxes.stop_gradient = True assigned_scores.stop_gradient = True assigned_labels.stop_gradient = True return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/simota_assigner.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py import paddle import numpy as np import paddle.nn.functional as F from ppdet.modeling.losses.varifocal_loss import varifocal_loss from ppdet.modeling.bbox_utils import batch_bbox_overlaps from ppdet.core.workspace import register @register class SimOTAAssigner(object): """Computes matching between predictions and ground truth. Args: center_radius (int | float, optional): Ground truth center size to judge whether a prior is in center. Default 2.5. candidate_topk (int, optional): The candidate top-k which used to get top-k ious to calculate dynamic-k. Default 10. iou_weight (int | float, optional): The scale factor for regression iou cost. Default 3.0. 
cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. num_classes (int): The num_classes of dataset. use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix. """ __shared__ = ['num_classes'] def __init__(self, center_radius=2.5, candidate_topk=10, iou_weight=3.0, cls_weight=1.0, num_classes=80, use_vfl=True): self.center_radius = center_radius self.candidate_topk = candidate_topk self.iou_weight = iou_weight self.cls_weight = cls_weight self.num_classes = num_classes self.use_vfl = use_vfl def get_in_gt_and_in_center_info(self, flatten_center_and_stride, gt_bboxes): num_gt = gt_bboxes.shape[0] flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile( [1, num_gt]) flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile( [1, num_gt]) flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile( [1, num_gt]) flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile( [1, num_gt]) # is prior centers in gt bboxes, shape: [n_center, n_gt] l_ = flatten_x - gt_bboxes[:, 0] t_ = flatten_y - gt_bboxes[:, 1] r_ = gt_bboxes[:, 2] - flatten_x b_ = gt_bboxes[:, 3] - flatten_y deltas = paddle.stack([l_, t_, r_, b_], axis=1) is_in_gts = deltas.min(axis=1) > 0 is_in_gts_all = is_in_gts.sum(axis=1) > 0 # is prior centers in gt centers gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y cl_ = flatten_x - ct_bound_l ct_ = flatten_y - ct_bound_t cr_ = ct_bound_r - flatten_x cb_ = ct_bound_b - flatten_y ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1) is_in_cts = ct_deltas.min(axis=1) > 0 is_in_cts_all = is_in_cts.sum(axis=1) > 0 # in any of gts or gt centers, shape: [n_center] is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all, is_in_cts_all) is_in_gts_or_centers_all_inds = paddle.nonzero( is_in_gts_or_centers_all).squeeze(1) # both in gts and gt centers, shape: [num_fg, num_gt] is_in_gts_and_centers = paddle.logical_and( paddle.gather( is_in_gts.cast('int'), is_in_gts_or_centers_all_inds, axis=0).cast('bool'), paddle.gather( is_in_cts.cast('int'), is_in_gts_or_centers_all_inds, axis=0).cast('bool')) return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt): match_matrix = np.zeros_like(cost_matrix.numpy()) # select candidate topk ious for dynamic-k calculation topk_ious, _ = paddle.topk( pairwise_ious, min(self.candidate_topk, pairwise_ious.shape[0]), axis=0) # calculate dynamic k for each gt dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1) for gt_idx in range(num_gt): _, pos_idx = paddle.topk( cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0 del topk_ious, dynamic_ks, pos_idx # match points more than two gts extra_match_gts_mask = match_matrix.sum(1) > 1 if extra_match_gts_mask.sum() > 0: cost_matrix = cost_matrix.numpy() cost_argmin = np.argmin( cost_matrix[extra_match_gts_mask, :], axis=1) match_matrix[extra_match_gts_mask, :] *= 0.0 match_matrix[extra_match_gts_mask, cost_argmin] = 1.0 # get foreground mask match_fg_mask_inmatrix = match_matrix.sum(1) > 0 match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, 
:].argmax(1) return match_gt_inds_to_fg, match_fg_mask_inmatrix def get_sample(self, assign_gt_inds, gt_bboxes): pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 if gt_bboxes.size == 0: # hack for index error case assert pos_assigned_gt_inds.size == 0 pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) else: if len(gt_bboxes.shape) < 2: gt_bboxes = gt_bboxes.resize(-1, 4) pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds def __call__(self, flatten_cls_pred_scores, flatten_center_and_stride, flatten_bboxes, gt_bboxes, gt_labels, eps=1e-7): """Assign gt to priors using SimOTA. TODO: add comment. Returns: assign_result: The assigned result. """ num_gt = gt_bboxes.shape[0] num_bboxes = flatten_bboxes.shape[0] if num_gt == 0 or num_bboxes == 0: # No ground truth or boxes label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes label_weight = np.ones([num_bboxes], dtype=np.float32) bbox_target = np.zeros_like(flatten_center_and_stride) return 0, label, label_weight, bbox_target is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( flatten_center_and_stride, gt_bboxes) # bboxes and scores to calculate matrix valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds] valid_cls_pred_scores = flatten_cls_pred_scores[ is_in_gts_or_centers_all_inds] num_valid_bboxes = valid_flatten_bboxes.shape[0] pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes, gt_bboxes) # [num_points,num_gts] if self.use_vfl: gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile( [num_valid_bboxes, 1]).reshape([-1]) valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( [1, num_gt, 1]).reshape([-1, self.num_classes]) vfl_score = np.zeros(valid_pred_scores.shape) vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy( )] = pairwise_ious.reshape([-1]) vfl_score = paddle.to_tensor(vfl_score) losses_vfl = varifocal_loss( valid_pred_scores, vfl_score, use_sigmoid=False).reshape([num_valid_bboxes, num_gt]) losses_giou = batch_bbox_overlaps( valid_flatten_bboxes, gt_bboxes, mode='giou') cost_matrix = ( losses_vfl * self.cls_weight + losses_giou * self.iou_weight + paddle.logical_not(is_in_boxes_and_center).cast('float32') * 100000000) else: iou_cost = -paddle.log(pairwise_ious + eps) gt_onehot_label = (F.one_hot( gt_labels.squeeze(-1).cast(paddle.int64), flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0) .tile([num_valid_bboxes, 1, 1])) valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( [1, num_gt, 1]) cls_cost = F.binary_cross_entropy( valid_pred_scores, gt_onehot_label, reduction='none').sum(-1) cost_matrix = ( cls_cost * self.cls_weight + iou_cost * self.iou_weight + paddle.logical_not(is_in_boxes_and_center).cast('float32') * 100000000) match_gt_inds_to_fg, match_fg_mask_inmatrix = \ self.dynamic_k_matching( cost_matrix, pairwise_ious, num_gt) # sample and assign results assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64) match_fg_mask_inall = np.zeros_like(assigned_gt_inds) match_fg_mask_inall[is_in_gts_or_centers_all.numpy( )] = match_fg_mask_inmatrix assigned_gt_inds[match_fg_mask_inall.astype( np.bool_)] = match_gt_inds_to_fg + 1 pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \ = self.get_sample(assigned_gt_inds, gt_bboxes.numpy()) bbox_target = np.zeros(flatten_bboxes.shape, 
paddle.common_ops_import.convert_dtype(flatten_bboxes.dtype)) bbox_weight = np.zeros_like(bbox_target) label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes label_weight = np.zeros([num_bboxes], dtype=np.float32) if len(pos_inds) > 0: gt_labels = gt_labels.numpy() pos_bbox_targets = pos_gt_bboxes bbox_target[pos_inds, :] = pos_bbox_targets bbox_weight[pos_inds, :] = 1.0 if not np.any(gt_labels): label[pos_inds] = 0 else: label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds] label_weight[pos_inds] = 1.0 if len(neg_inds) > 0: label_weight[neg_inds] = 1.0 pos_num = max(pos_inds.size, 1) return pos_num, label, label_weight, bbox_target ================================================ FILE: ppdet/modeling/assigners/task_aligned_assigner.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import batch_iou_similarity from .utils import (gather_topk_anchors, check_points_inside_bboxes, compute_max_iou_anchor) __all__ = ['TaskAlignedAssigner'] def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.): """Calculate distance ratio of box1 and box2 in batch for larger stride anchors dist/stride to promote the survive of large distance match Args: anchor (Tensor): box with the shape [L, 2] gt (Tensor): box with the shape [N, M2, 4] Return: dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2] """ center1 = anchor.unsqueeze(0) center2 = (gt[..., :2] + gt[..., -2:]) / 2. center1 = center1.unsqueeze(1) # [N, M1, 2] -> [N, 1, M1, 2] center2 = center2.unsqueeze(2) # [N, M2, 2] -> [N, M2, 1, 2] stride = paddle.concat([ paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst) ]).unsqueeze(0).unsqueeze(0) dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride dist_ratio = dist dist_ratio[dist < max_dist] = 1. dist_ratio[dist >= max_dist] = 0. return dist_ratio @register class TaskAlignedAssigner(nn.Layer): """TOOD: Task-aligned One-stage Object Detection """ def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9, is_close_gt=False): super(TaskAlignedAssigner, self).__init__() self.topk = topk self.alpha = alpha self.beta = beta self.eps = eps self.is_close_gt = is_close_gt @paddle.no_grad() def forward(self, pred_scores, pred_bboxes, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_segms=None, gt_scores=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py The assignment is done in following steps 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt 2. select top-k bbox as candidates for each gt 3. 
limit the positive sample's center in gt (because the anchor-free detector only can predict positive distance) 4. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. Args: pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format num_anchors_list (List): num of anchors in each level, shape(L) gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 batch_size, num_anchors, num_classes = pred_scores.shape _, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] ious = batch_iou_similarity(gt_bboxes, pred_bboxes) # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) gt_labels_ind = paddle.stack( [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], axis=-1) bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) # compute alignment metrics, [B, n, L] alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( self.beta) # check the positive sample's center in gt, [B, n, L] if self.is_close_gt: is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list) else: is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes) # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] is_in_topk = gather_topk_anchors( alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts * pad_gt_mask # if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected, [B, n, L] mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( [1, num_max_boxes, 1]) is_max_iou = compute_max_iou_anchor(ious) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) assigned_scores = 
F.one_hot(assigned_labels, num_classes + 1) ind = list(range(num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) max_ious_per_instance = (ious * mask_positive).max(axis=-1, keepdim=True) alignment_metrics = alignment_metrics / ( max_metrics_per_instance + self.eps) * max_ious_per_instance alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) assigned_scores = assigned_scores * alignment_metrics if gt_segms is not None: return assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index else: return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/task_aligned_assigner_cr.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import batch_iou_similarity from .utils import (gather_topk_anchors, check_points_inside_bboxes, compute_max_iou_anchor) __all__ = ['TaskAlignedAssigner_CR'] @register class TaskAlignedAssigner_CR(nn.Layer): """TOOD: Task-aligned One-stage Object Detection with Center R """ def __init__(self, topk=13, alpha=1.0, beta=6.0, center_radius=None, eps=1e-9): super(TaskAlignedAssigner_CR, self).__init__() self.topk = topk self.alpha = alpha self.beta = beta self.center_radius = center_radius self.eps = eps @paddle.no_grad() def forward(self, pred_scores, pred_bboxes, anchor_points, stride_tensor, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py The assignment is done in following steps 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt 2. select top-k bbox as candidates for each gt 3. limit the positive sample's center in gt (because the anchor-free detector only can predict positive distance) 4. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. 
Args: pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format stride_tensor (Tensor, float32): stride of feature map, shape(L, 1) gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 batch_size, num_anchors, num_classes = pred_scores.shape _, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] ious = batch_iou_similarity(gt_bboxes, pred_bboxes) # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) gt_labels_ind = paddle.stack( [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], axis=-1) bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) # compute alignment metrics, [B, n, L] alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( self.beta) * pad_gt_mask # select positive sample, [B, n, L] if self.center_radius is None: # check the positive sample's center in gt, [B, n, L] is_in_gts = check_points_inside_bboxes( anchor_points, gt_bboxes, sm_use=True) # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] mask_positive = gather_topk_anchors( alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts else: is_in_gts, is_in_center = check_points_inside_bboxes( anchor_points, gt_bboxes, stride_tensor * self.center_radius, sm_use=True) is_in_gts *= pad_gt_mask is_in_center *= pad_gt_mask candidate_metrics = paddle.where( is_in_gts.sum(-1, keepdim=True) == 0, alignment_metrics + is_in_center, alignment_metrics) mask_positive = gather_topk_anchors( candidate_metrics, self.topk, topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) | (is_in_gts > 0), 'float32') # if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected, [B, n, L] mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( [1, num_max_boxes, 1]) is_max_iou = compute_max_iou_anchor(ious * mask_positive) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, 
bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) assigned_scores = F.one_hot(assigned_labels, num_classes + 1) ind = list(range(num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) max_ious_per_instance = (ious * mask_positive).max(axis=-1, keepdim=True) alignment_metrics = alignment_metrics / ( max_metrics_per_instance + self.eps) * max_ious_per_instance alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) assigned_scores = assigned_scores * alignment_metrics return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/uniform_assigner.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import batch_bbox_overlaps from ppdet.modeling.transformers import bbox_xyxy_to_cxcywh __all__ = ['UniformAssigner'] def batch_p_dist(x, y, p=2): """ calculate pairwise p_dist, the first index of x and y are batch return [x.shape[0], y.shape[0]] """ x = x.unsqueeze(1) diff = x - y return paddle.norm(diff, p=p, axis=list(range(2, diff.dim()))) @register class UniformAssigner(nn.Layer): def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4): super(UniformAssigner, self).__init__() self.pos_ignore_thr = pos_ignore_thr self.neg_ignore_thr = neg_ignore_thr self.match_times = match_times def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None): num_bboxes = bbox_pred.shape[0] num_gts = gt_bboxes.shape[0] match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32) pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes) pred_max_iou = pred_ious.max(axis=1) neg_ignore = pred_max_iou > self.neg_ignore_thr # exclude potential ignored neg samples first, deal with pos samples later #match_labels: -2(ignore), -1(neg) or >=0(pos_inds) match_labels = paddle.where(neg_ignore, paddle.full_like(match_labels, -2), match_labels) bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred) anchor_c = bbox_xyxy_to_cxcywh(anchor) gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes) bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1) anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1) top_pred = bbox_pred_dist.topk( k=self.match_times, axis=0, largest=False)[1] top_anchor = anchor_dist.topk( k=self.match_times, axis=0, largest=False)[1] tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts]) tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts]) pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1]) pos_inds = 
paddle.concat([tar_pred, tar_anchor]).reshape([-1]) pos_anchor = anchor[pos_places] pos_tar_bbox = gt_bboxes[pos_inds] pos_ious = batch_bbox_overlaps( pos_anchor, pos_tar_bbox, is_aligned=True) pos_ignore = pos_ious < self.pos_ignore_thr pos_inds = paddle.where(pos_ignore, paddle.full_like(pos_inds, -2), pos_inds) match_labels[pos_places] = pos_inds match_labels.stop_gradient = True pos_keep = ~pos_ignore if pos_keep.sum() > 0: pos_places_keep = pos_places[pos_keep] pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4]) pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach() else: pos_bbox_pred = None pos_bbox_tar = None return match_labels, pos_bbox_pred, pos_bbox_tar ================================================ FILE: ppdet/modeling/assigners/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F __all__ = [ 'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes', 'compute_max_iou_anchor', 'compute_max_iou_gt', 'generate_anchors_for_grid_cell' ] def pad_gt(gt_labels, gt_bboxes, gt_scores=None): r""" Pad 0 in gt_labels and gt_bboxes. Args: gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i) gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes, shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) Returns: pad_gt_labels (Tensor, int64): shape[B, n, 1] pad_gt_bboxes (Tensor, float32): shape[B, n, 4] pad_gt_scores (Tensor, float32): shape[B, n, 1] pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox """ if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes, paddle.Tensor): assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 pad_gt_mask = ( gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype) if gt_scores is None: gt_scores = pad_gt_mask.clone() assert gt_labels.ndim == gt_scores.ndim return gt_labels, gt_bboxes, gt_scores, pad_gt_mask elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list): assert len(gt_labels) == len(gt_bboxes), \ 'The number of `gt_labels` and `gt_bboxes` is not equal. 
' num_max_boxes = max([len(a) for a in gt_bboxes]) batch_size = len(gt_bboxes) # pad label and bbox pad_gt_labels = paddle.zeros( [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype) pad_gt_bboxes = paddle.zeros( [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype) pad_gt_scores = paddle.zeros( [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) pad_gt_mask = paddle.zeros( [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)): if len(label) > 0 and len(bbox) > 0: pad_gt_labels[i, :len(label)] = label pad_gt_bboxes[i, :len(bbox)] = bbox pad_gt_mask[i, :len(bbox)] = 1. if gt_scores is not None: pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i] if gt_scores is None: pad_gt_scores = pad_gt_mask.clone() return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask else: raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ') def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): r""" Args: metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors topk (int): The number of top elements to look for along the axis. largest (bool) : largest is a flag, if set to true, algorithm will sort by descending order, otherwise sort by ascending order. Default: True topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask, Default: None eps (float): Default: 1e-9 Returns: is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected """ num_anchors = metrics.shape[-1] topk_metrics, topk_idxs = paddle.topk( metrics, topk, axis=-1, largest=largest) if topk_mask is None: topk_mask = ( topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype) is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( axis=-2).astype(metrics.dtype) return is_in_topk * topk_mask def check_points_inside_bboxes(points, bboxes, center_radius_tensor=None, eps=1e-9, sm_use=False): r""" Args: points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None. eps (float): Default: 1e-9 Returns: is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected """ points = points.unsqueeze([0, 1]) x, y = points.chunk(2, axis=-1) xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1) # check whether `points` is in `bboxes` l = x - xmin t = y - ymin r = xmax - x b = ymax - y delta_ltrb = paddle.concat([l, t, r, b], axis=-1) is_in_bboxes = (delta_ltrb.min(axis=-1) > eps) if center_radius_tensor is not None: # check whether `points` is in `center_radius` center_radius_tensor = center_radius_tensor.unsqueeze([0, 1]) cx = (xmin + xmax) * 0.5 cy = (ymin + ymax) * 0.5 l = x - (cx - center_radius_tensor) t = y - (cy - center_radius_tensor) r = (cx + center_radius_tensor) - x b = (cy + center_radius_tensor) - y delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1) is_in_center = (delta_ltrb_c.min(axis=-1) > eps) if sm_use: return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype( bboxes.dtype) else: return (paddle.logical_and(is_in_bboxes, is_in_center), paddle.logical_or(is_in_bboxes, is_in_center)) return is_in_bboxes.astype(bboxes.dtype) def compute_max_iou_anchor(ious): r""" For each anchor, find the GT with the largest IOU. Args: ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors Returns: is_max_iou (Tensor, float32): shape[B, n, L], value=1. 
means selected """ num_max_boxes = ious.shape[-2] max_iou_index = ious.argmax(axis=-2) is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1]) return is_max_iou.astype(ious.dtype) def compute_max_iou_gt(ious): r""" For each GT, find the anchor with the largest IOU. Args: ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors Returns: is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected """ num_anchors = ious.shape[-1] max_iou_index = ious.argmax(axis=-1) is_max_iou = F.one_hot(max_iou_index, num_anchors) return is_max_iou.astype(ious.dtype) def generate_anchors_for_grid_cell(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0.5, dtype='float32'): r""" Like ATSS, generate anchors based on grid size. Args: feats (List[Tensor]): shape[s, (b, c, h, w)] fpn_strides (tuple|list): shape[s], stride for each scale feature grid_cell_size (float): anchor size grid_cell_offset (float): The range is between 0 and 1. Returns: anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format. anchor_points (Tensor): shape[l, 2], "x, y" format. num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...]. stride_tensor (Tensor): shape[l, 1], contains the stride for each scale. """ assert len(feats) == len(fpn_strides) anchors = [] anchor_points = [] num_anchors_list = [] stride_tensor = [] for feat, stride in zip(feats, fpn_strides): _, _, h, w = feat.shape cell_half_size = grid_cell_size * stride * 0.5 shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor = paddle.stack( [ shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size ], axis=-1).astype(dtype) anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype) anchors.append(anchor.reshape([-1, 4])) anchor_points.append(anchor_point.reshape([-1, 2])) num_anchors_list.append(len(anchors[-1])) stride_tensor.append( paddle.full( [num_anchors_list[-1], 1], stride, dtype=dtype)) anchors = paddle.concat(anchors) anchors.stop_gradient = True anchor_points = paddle.concat(anchor_points) anchor_points.stop_gradient = True stride_tensor = paddle.concat(stride_tensor) stride_tensor.stop_gradient = True return anchors, anchor_points, num_anchors_list, stride_tensor ================================================ FILE: ppdet/modeling/backbones/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import vgg from . import resnet from . import darknet from . import mobilenet_v1 from . import mobilenet_v3 from . import hrnet from . import lite_hrnet from . import blazenet from . import ghostnet from . import senet from . import res2net from . import dla from . import shufflenet_v2 from . import swin_transformer from . import lcnet from . import hardnet from . import esnet from . import cspresnet from . 
import csp_darknet from . import convnext from . import vision_transformer from . import mobileone from . import trans_encoder from . import focalnet from . import vit_mae from . import hgnet_v2 from . import clrnet_resnet from .vgg import * from .resnet import * from .darknet import * from .mobilenet_v1 import * from .mobilenet_v3 import * from .hrnet import * from .lite_hrnet import * from .blazenet import * from .ghostnet import * from .senet import * from .res2net import * from .dla import * from .shufflenet_v2 import * from .swin_transformer import * from .lcnet import * from .hardnet import * from .esnet import * from .cspresnet import * from .csp_darknet import * from .convnext import * from .vision_transformer import * from .mobileone import * from .trans_encoder import * from .focalnet import * from .vitpose import * from .vit_mae import * from .hgnet_v2 import * from .clrnet_resnet import * ================================================ FILE: ppdet/modeling/backbones/blazenet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['BlazeNet'] def hard_swish(x): return x * F.relu6(x + 3) / 6. 
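# A quick reference for hard_swish above (illustrative only, not used elsewhere
# in this file): relu6 clamps (x + 3) to [0, 6], so the function is 0 for
# x <= -3, equals x for x >= 3, and follows x * (x + 3) / 6 in between.
def _hard_swish_piecewise(x):
    # Piecewise equivalent of hard_swish(x); assumes x is a paddle.Tensor.
    return paddle.where(x <= -3., paddle.zeros_like(x),
                        paddle.where(x >= 3., x, x * (x + 3.) / 6.))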
class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_groups=1, act='relu', conv_lr=0.1, conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ConvBNLayer, self).__init__() self.act = act self._conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=conv_lr, initializer=KaimingNormal()), bias_attr=False) if norm_type in ['bn', 'sync_bn']: self._batch_norm = nn.BatchNorm2D(out_channels) def forward(self, x): x = self._conv(x) x = self._batch_norm(x) if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) elif self.act == 'leaky': x = F.leaky_relu(x) elif self.act == 'hard_swish': x = hard_swish(x) return x class BlazeBlock(nn.Layer): def __init__(self, in_channels, out_channels1, out_channels2, double_channels=None, stride=1, use_5x5kernel=True, act='relu', name=None): super(BlazeBlock, self).__init__() assert stride in [1, 2] self.use_pool = not stride == 1 self.use_double_block = double_channels is not None self.conv_dw = [] if use_5x5kernel: self.conv_dw.append( self.add_sublayer( name + "1_dw", ConvBNLayer( in_channels=in_channels, out_channels=out_channels1, kernel_size=5, stride=stride, padding=2, num_groups=out_channels1, name=name + "1_dw"))) else: self.conv_dw.append( self.add_sublayer( name + "1_dw_1", ConvBNLayer( in_channels=in_channels, out_channels=out_channels1, kernel_size=3, stride=1, padding=1, num_groups=out_channels1, name=name + "1_dw_1"))) self.conv_dw.append( self.add_sublayer( name + "1_dw_2", ConvBNLayer( in_channels=out_channels1, out_channels=out_channels1, kernel_size=3, stride=stride, padding=1, num_groups=out_channels1, name=name + "1_dw_2"))) self.act = act if self.use_double_block else None self.conv_pw = ConvBNLayer( in_channels=out_channels1, out_channels=out_channels2, kernel_size=1, stride=1, padding=0, act=self.act, name=name + "1_sep") if self.use_double_block: self.conv_dw2 = [] if use_5x5kernel: self.conv_dw2.append( self.add_sublayer( name + "2_dw", ConvBNLayer( in_channels=out_channels2, out_channels=out_channels2, kernel_size=5, stride=1, padding=2, num_groups=out_channels2, name=name + "2_dw"))) else: self.conv_dw2.append( self.add_sublayer( name + "2_dw_1", ConvBNLayer( in_channels=out_channels2, out_channels=out_channels2, kernel_size=3, stride=1, padding=1, num_groups=out_channels2, name=name + "1_dw_1"))) self.conv_dw2.append( self.add_sublayer( name + "2_dw_2", ConvBNLayer( in_channels=out_channels2, out_channels=out_channels2, kernel_size=3, stride=1, padding=1, num_groups=out_channels2, name=name + "2_dw_2"))) self.conv_pw2 = ConvBNLayer( in_channels=out_channels2, out_channels=double_channels, kernel_size=1, stride=1, padding=0, name=name + "2_sep") # shortcut if self.use_pool: shortcut_channel = double_channels or out_channels2 self._shortcut = [] self._shortcut.append( self.add_sublayer( name + '_shortcut_pool', nn.MaxPool2D( kernel_size=stride, stride=stride, ceil_mode=True))) self._shortcut.append( self.add_sublayer( name + '_shortcut_conv', ConvBNLayer( in_channels=in_channels, out_channels=shortcut_channel, kernel_size=1, stride=1, padding=0, name="shortcut" + name))) def forward(self, x): y = x for conv_dw_block in self.conv_dw: y = conv_dw_block(y) y = self.conv_pw(y) if self.use_double_block: for conv_dw2_block in self.conv_dw2: y = conv_dw2_block(y) y = self.conv_pw2(y) if self.use_pool: for shortcut in self._shortcut: x = shortcut(x) return 
F.relu(paddle.add(x, y)) @register @serializable class BlazeNet(nn.Layer): """ BlazeFace, see https://arxiv.org/abs/1907.05047 Args: blaze_filters (list): number of filter for each blaze block. double_blaze_filters (list): number of filter for each double_blaze block. use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv. """ def __init__( self, blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]], double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]], use_5x5kernel=True, act=None): super(BlazeNet, self).__init__() conv1_num_filters = blaze_filters[0][0] self.conv1 = ConvBNLayer( in_channels=3, out_channels=conv1_num_filters, kernel_size=3, stride=2, padding=1, name="conv1") in_channels = conv1_num_filters self.blaze_block = [] self._out_channels = [] for k, v in enumerate(blaze_filters): assert len(v) in [2, 3], \ "blaze_filters {} not in [2, 3]" if len(v) == 2: self.blaze_block.append( self.add_sublayer( 'blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], use_5x5kernel=use_5x5kernel, act=act, name='blaze_{}'.format(k)))) elif len(v) == 3: self.blaze_block.append( self.add_sublayer( 'blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], stride=v[2], use_5x5kernel=use_5x5kernel, act=act, name='blaze_{}'.format(k)))) in_channels = v[1] for k, v in enumerate(double_blaze_filters): assert len(v) in [3, 4], \ "blaze_filters {} not in [3, 4]" if len(v) == 3: self.blaze_block.append( self.add_sublayer( 'double_blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], double_channels=v[2], use_5x5kernel=use_5x5kernel, act=act, name='double_blaze_{}'.format(k)))) elif len(v) == 4: self.blaze_block.append( self.add_sublayer( 'double_blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], double_channels=v[2], stride=v[3], use_5x5kernel=use_5x5kernel, act=act, name='double_blaze_{}'.format(k)))) in_channels = v[2] self._out_channels.append(in_channels) def forward(self, inputs): outs = [] y = self.conv1(inputs['image']) for block in self.blaze_block: y = block(y) outs.append(y) return [outs[-4], outs[-1]] @property def out_shape(self): return [ ShapeSpec(channels=c) for c in [self._out_channels[-4], self._out_channels[-1]] ] ================================================ FILE: ppdet/modeling/backbones/clrnet_resnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
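# Usage sketch (illustrative; CLRResNet and the ResNet variants it wraps are
# defined below in this file): the backbone consumes a raw image tensor and
# returns the four stage feature maps, e.g.
#
#   backbone = CLRResNet(resnet='resnet18', pretrained=False)
#   feats = backbone(paddle.rand([1, 3, 320, 800]))  # list of 4 stage outputs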
from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['CLRResNet'] model_urls = { 'resnet18': 'https://x2paddle.bj.bcebos.com/vision/models/resnet18-pt.pdparams', 'resnet34': 'https://x2paddle.bj.bcebos.com/vision/models/resnet34-pt.pdparams', 'resnet50': 'https://x2paddle.bj.bcebos.com/vision/models/resnet50-pt.pdparams', 'resnet101': 'https://x2paddle.bj.bcebos.com/vision/models/resnet101-pt.pdparams', 'resnet152': 'https://x2paddle.bj.bcebos.com/vision/models/resnet152-pt.pdparams', 'resnext50_32x4d': 'https://x2paddle.bj.bcebos.com/vision/models/resnext50_32x4d-pt.pdparams', 'resnext101_32x8d': 'https://x2paddle.bj.bcebos.com/vision/models/resnext101_32x8d-pt.pdparams', 'wide_resnet50_2': 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet50_2-pt.pdparams', 'wide_resnet101_2': 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet101_2-pt.pdparams', } class BasicBlock(nn.Layer): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2D if dilation > 1: raise NotImplementedError( "Dilation > 1 not supported in BasicBlock") self.conv1 = nn.Conv2D( inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) self.bn1 = norm_layer(planes) self.relu = nn.ReLU() self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class BottleneckBlock(nn.Layer): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BottleneckBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2D width = int(planes * (base_width / 64.)) * groups self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False) self.bn1 = norm_layer(width) self.conv2 = nn.Conv2D( width, width, 3, padding=dilation, stride=stride, groups=groups, dilation=dilation, bias_attr=False) self.bn2 = norm_layer(width) self.conv3 = nn.Conv2D( width, planes * self.expansion, 1, bias_attr=False) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU() self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResNet(nn.Layer): """ResNet model from `"Deep Residual Learning for Image Recognition" `_. Args: Block (BasicBlock|BottleneckBlock): Block module of model. depth (int, optional): Layers of ResNet, Default: 50. width (int, optional): Base width per convolution group for each convolution block, Default: 64. num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. 
groups (int, optional): Number of groups for each convolution block, Default: 1. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet model. Examples: .. code-block:: python import paddle from paddle.vision.models import ResNet from paddle.vision.models.resnet import BottleneckBlock, BasicBlock # build ResNet with 18 layers resnet18 = ResNet(BasicBlock, 18) # build ResNet with 50 layers resnet50 = ResNet(BottleneckBlock, 50) # build Wide ResNet model wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2) # build ResNeXt model resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32) x = paddle.rand([1, 3, 224, 224]) out = resnet18(x) print(out.shape) # [1, 1000] """ def __init__(self, block, depth=50, width=64, with_pool=True, groups=1): super(ResNet, self).__init__() layer_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3] } layers = layer_cfg[depth] self.groups = groups self.base_width = width self.with_pool = with_pool self._norm_layer = nn.BatchNorm2D self.inplanes = 64 self.dilation = 1 self.conv1 = nn.Conv2D( 3, self.inplanes, kernel_size=7, stride=2, padding=3, bias_attr=False) self.bn1 = self._norm_layer(self.inplanes) self.relu = nn.ReLU() self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if with_pool: self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) ch_out_list = [64, 128, 256, 512] block = BottleneckBlock if depth >= 50 else BasicBlock self._out_channels = [block.expansion * v for v in ch_out_list] self._out_strides = [4, 8, 16, 32] self.return_idx = [0, 1, 2, 3] def _make_layer(self, block, planes, blocks, stride=1, dilate=False): norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2D( self.inplanes, planes * block.expansion, 1, stride=stride, bias_attr=False), norm_layer(planes * block.expansion), ) layers = [] layers.append( block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append( block( self.inplanes, planes, groups=self.groups, base_width=self.base_width, norm_layer=norm_layer)) return nn.Sequential(*layers) @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) out_layers = [] x = self.layer1(x) out_layers.append(x) x = self.layer2(x) out_layers.append(x) x = self.layer3(x) out_layers.append(x) x = self.layer4(x) out_layers.append(x) if self.with_pool: x = self.avgpool(x) return out_layers @register @serializable class CLRResNet(nn.Layer): def __init__(self, resnet='resnet18', pretrained=True, out_conv=False, fea_stride=8, out_channel=128, in_channels=[64, 128, 256, 512], cfg=None): super(CLRResNet, self).__init__() self.cfg = cfg self.in_channels = in_channels self.model = eval(resnet)(pretrained=pretrained) self.out = None if out_conv: out_channel = 512 for chan in reversed(self.in_channels): if chan < 0: continue out_channel = chan break self.out = nn.Conv2D( out_channel * 
self.model.expansion, cfg.featuremap_out_channel, kernel_size=1, bias_attr=False) @property def out_shape(self): return self.model.out_shape def forward(self, x): x = self.model(x) if self.out: x[-1] = self.out(x[-1]) return x def _resnet(arch, Block, depth, pretrained, **kwargs): model = ResNet(Block, depth, **kwargs) if pretrained: assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( arch) weight_path = get_weights_path_from_url(model_urls[arch]) param = paddle.load(weight_path) model.set_dict(param) return model def resnet18(pretrained=False, **kwargs): """ResNet 18-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 18-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet18 # build model model = resnet18() # build model and load imagenet pretrained weight # model = resnet18(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs) def resnet34(pretrained=False, **kwargs): """ResNet 34-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 34-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet34 # build model model = resnet34() # build model and load imagenet pretrained weight # model = resnet34(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs) def resnet50(pretrained=False, **kwargs): """ResNet 50-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet50 # build model model = resnet50() # build model and load imagenet pretrained weight # model = resnet50(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs) def resnet101(pretrained=False, **kwargs): """ResNet 101-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 101-layer. Examples: .. 
code-block:: python import paddle from paddle.vision.models import resnet101 # build model model = resnet101() # build model and load imagenet pretrained weight # model = resnet101(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs) def resnet152(pretrained=False, **kwargs): """ResNet 152-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 152-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet152 # build model model = resnet152() # build model and load imagenet pretrained weight # model = resnet152(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs) def resnext50_32x4d(pretrained=False, **kwargs): """ResNeXt-50 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 32x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext50_32x4d # build model model = resnext50_32x4d() # build model and load imagenet pretrained weight # model = resnext50_32x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 32 kwargs['width'] = 4 return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs) def resnext50_64x4d(pretrained=False, **kwargs): """ResNeXt-50 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 64x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext50_64x4d # build model model = resnext50_64x4d() # build model and load imagenet pretrained weight # model = resnext50_64x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 64 kwargs['width'] = 4 return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs) def resnext101_32x4d(pretrained=False, **kwargs): """ResNeXt-101 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 32x4d model. Examples: .. 
code-block:: python import paddle from paddle.vision.models import resnext101_32x4d # build model model = resnext101_32x4d() # build model and load imagenet pretrained weight # model = resnext101_32x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 32 kwargs['width'] = 4 return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained, **kwargs) def resnext101_64x4d(pretrained=False, **kwargs): """ResNeXt-101 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 64x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext101_64x4d # build model model = resnext101_64x4d() # build model and load imagenet pretrained weight # model = resnext101_64x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 64 kwargs['width'] = 4 return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained, **kwargs) def resnext152_32x4d(pretrained=False, **kwargs): """ResNeXt-152 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 32x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext152_32x4d # build model model = resnext152_32x4d() # build model and load imagenet pretrained weight # model = resnext152_32x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 32 kwargs['width'] = 4 return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained, **kwargs) def resnext152_64x4d(pretrained=False, **kwargs): """ResNeXt-152 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 64x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext152_64x4d # build model model = resnext152_64x4d() # build model and load imagenet pretrained weight # model = resnext152_64x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 64 kwargs['width'] = 4 return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained, **kwargs) def wide_resnet50_2(pretrained=False, **kwargs): """Wide ResNet-50-2 model from `"Wide Residual Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-50-2 model. Examples: .. 
code-block:: python import paddle from paddle.vision.models import wide_resnet50_2 # build model model = wide_resnet50_2() # build model and load imagenet pretrained weight # model = wide_resnet50_2(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['width'] = 64 * 2 return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs) def wide_resnet101_2(pretrained=False, **kwargs): """Wide ResNet-101-2 model from `"Wide Residual Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-101-2 model. Examples: .. code-block:: python import paddle from paddle.vision.models import wide_resnet101_2 # build model model = wide_resnet101_2() # build model and load imagenet pretrained weight # model = wide_resnet101_2(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['width'] = 64 * 2 return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained, **kwargs) ================================================ FILE: ppdet/modeling/backbones/convnext.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ''' Modified from https://github.com/facebookresearch/ConvNeXt Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. This source code is licensed under the license found in the LICENSE file in the root directory of this source tree. ''' import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant import numpy as np from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from .transformer_utils import DropPath, trunc_normal_, zeros_ __all__ = ['ConvNeXt'] class Block(nn.Layer): r""" ConvNeXt Block. There are two equivalent implementations: (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back We use (2) as we find it slightly faster in Pypaddle Args: dim (int): Number of input channels. drop_path (float): Stochastic depth rate. Default: 0.0 layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
""" def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): super().__init__() self.dwconv = nn.Conv2D( dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv self.norm = LayerNorm(dim, eps=1e-6) self.pwconv1 = nn.Linear( dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers self.act = nn.GELU() self.pwconv2 = nn.Linear(4 * dim, dim) if layer_scale_init_value > 0: self.gamma = self.create_parameter( shape=(dim, ), attr=ParamAttr(initializer=Constant(layer_scale_init_value))) else: self.gamma = None self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( ) def forward(self, x): input = x x = self.dwconv(x) x = x.transpose([0, 2, 3, 1]) x = self.norm(x) x = self.pwconv1(x) x = self.act(x) x = self.pwconv2(x) if self.gamma is not None: x = self.gamma * x x = x.transpose([0, 3, 1, 2]) x = input + self.drop_path(x) return x class LayerNorm(nn.Layer): r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). """ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): super().__init__() self.weight = self.create_parameter( shape=(normalized_shape, ), attr=ParamAttr(initializer=Constant(1.))) self.bias = self.create_parameter( shape=(normalized_shape, ), attr=ParamAttr(initializer=Constant(0.))) self.eps = eps self.data_format = data_format if self.data_format not in ["channels_last", "channels_first"]: raise NotImplementedError self.normalized_shape = (normalized_shape, ) def forward(self, x): if self.data_format == "channels_last": return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) elif self.data_format == "channels_first": u = x.mean(1, keepdim=True) s = (x - u).pow(2).mean(1, keepdim=True) x = (x - u) / paddle.sqrt(s + self.eps) x = self.weight[:, None, None] * x + self.bias[:, None, None] return x @register @serializable class ConvNeXt(nn.Layer): r""" ConvNeXt A Pypaddle impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf Args: in_chans (int): Number of input image channels. Default: 3 depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] drop_path_rate (float): Stochastic depth rate. Default: 0. layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
""" arch_settings = { 'tiny': { 'depths': [3, 3, 9, 3], 'dims': [96, 192, 384, 768] }, 'small': { 'depths': [3, 3, 27, 3], 'dims': [96, 192, 384, 768] }, 'base': { 'depths': [3, 3, 27, 3], 'dims': [128, 256, 512, 1024] }, 'large': { 'depths': [3, 3, 27, 3], 'dims': [192, 384, 768, 1536] }, 'xlarge': { 'depths': [3, 3, 27, 3], 'dims': [256, 512, 1024, 2048] }, } def __init__( self, arch='tiny', in_chans=3, drop_path_rate=0., layer_scale_init_value=1e-6, return_idx=[1, 2, 3], norm_output=True, pretrained=None, ): super().__init__() depths = self.arch_settings[arch]['depths'] dims = self.arch_settings[arch]['dims'] self.downsample_layers = nn.LayerList( ) # stem and 3 intermediate downsampling conv layers stem = nn.Sequential( nn.Conv2D( in_chans, dims[0], kernel_size=4, stride=4), LayerNorm( dims[0], eps=1e-6, data_format="channels_first")) self.downsample_layers.append(stem) for i in range(3): downsample_layer = nn.Sequential( LayerNorm( dims[i], eps=1e-6, data_format="channels_first"), nn.Conv2D( dims[i], dims[i + 1], kernel_size=2, stride=2), ) self.downsample_layers.append(downsample_layer) self.stages = nn.LayerList( ) # 4 feature resolution stages, each consisting of multiple residual blocks dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] cur = 0 for i in range(4): stage = nn.Sequential(* [ Block( dim=dims[i], drop_path=dp_rates[cur + j], layer_scale_init_value=layer_scale_init_value) for j in range(depths[i]) ]) self.stages.append(stage) cur += depths[i] self.return_idx = return_idx self.dims = [dims[i] for i in return_idx] # [::-1] self.norm_output = norm_output if norm_output: self.norms = nn.LayerList([ LayerNorm( c, eps=1e-6, data_format="channels_first") for c in self.dims ]) self.apply(self._init_weights) if pretrained is not None: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained self.set_state_dict(paddle.load(path)) def _init_weights(self, m): if isinstance(m, (nn.Conv2D, nn.Linear)): trunc_normal_(m.weight) zeros_(m.bias) def forward_features(self, x): output = [] for i in range(4): x = self.downsample_layers[i](x) x = self.stages[i](x) output.append(x) outputs = [output[i] for i in self.return_idx] if self.norm_output: outputs = [self.norms[i](out) for i, out in enumerate(outputs)] return outputs def forward(self, x): x = self.forward_features(x['image']) return x @property def out_shape(self): return [ShapeSpec(channels=c) for c in self.dims] ================================================ FILE: ppdet/modeling/backbones/csp_darknet.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ppdet.modeling.initializer import conv_init_ from ..shape_spec import ShapeSpec __all__ = [ 'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer' ] class BaseConv(nn.Layer): def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"): super(BaseConv, self).__init__() self.conv = nn.Conv2D( in_channels, out_channels, kernel_size=ksize, stride=stride, padding=(ksize - 1) // 2, groups=groups, bias_attr=bias) self.bn = nn.BatchNorm2D( out_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._init_weights() def _init_weights(self): conv_init_(self.conv) def forward(self, x): # use 'x * F.sigmoid(x)' replace 'silu' x = self.bn(self.conv(x)) y = x * F.sigmoid(x) return y class DWConv(nn.Layer): """Depthwise Conv""" def __init__(self, in_channels, out_channels, ksize, stride=1, bias=False, act="silu"): super(DWConv, self).__init__() self.dw_conv = BaseConv( in_channels, in_channels, ksize=ksize, stride=stride, groups=in_channels, bias=bias, act=act) self.pw_conv = BaseConv( in_channels, out_channels, ksize=1, stride=1, groups=1, bias=bias, act=act) def forward(self, x): return self.pw_conv(self.dw_conv(x)) class Focus(nn.Layer): """Focus width and height information into channel space, used in YOLOX.""" def __init__(self, in_channels, out_channels, ksize=3, stride=1, bias=False, act="silu"): super(Focus, self).__init__() self.conv = BaseConv( in_channels * 4, out_channels, ksize=ksize, stride=stride, bias=bias, act=act) def forward(self, inputs): # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] top_left = inputs[:, :, 0::2, 0::2] top_right = inputs[:, :, 0::2, 1::2] bottom_left = inputs[:, :, 1::2, 0::2] bottom_right = inputs[:, :, 1::2, 1::2] outputs = paddle.concat( [top_left, bottom_left, top_right, bottom_right], 1) return self.conv(outputs) class BottleNeck(nn.Layer): def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5, depthwise=False, bias=False, act="silu"): super(BottleNeck, self).__init__() hidden_channels = int(out_channels * expansion) Conv = DWConv if depthwise else BaseConv self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.conv2 = Conv( hidden_channels, out_channels, ksize=3, stride=1, bias=bias, act=act) self.add_shortcut = shortcut and in_channels == out_channels def forward(self, x): y = self.conv2(self.conv1(x)) if self.add_shortcut: y = y + x return y class SPPLayer(nn.Layer): """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), bias=False, act="silu"): super(SPPLayer, self).__init__() hidden_channels = in_channels // 2 self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.maxpoolings = nn.LayerList([ nn.MaxPool2D( kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes ]) conv2_channels = hidden_channels * (len(kernel_sizes) + 1) self.conv2 = BaseConv( conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) def forward(self, x): x = self.conv1(x) x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) x = self.conv2(x) return x class SPPFLayer(nn.Layer): """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, 
equivalent to SPP(k=(5, 9, 13)) """ def __init__(self, in_channels, out_channels, ksize=5, bias=False, act='silu'): super(SPPFLayer, self).__init__() hidden_channels = in_channels // 2 self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.maxpooling = nn.MaxPool2D( kernel_size=ksize, stride=1, padding=ksize // 2) conv2_channels = hidden_channels * 4 self.conv2 = BaseConv( conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) def forward(self, x): x = self.conv1(x) y1 = self.maxpooling(x) y2 = self.maxpooling(y1) y3 = self.maxpooling(y2) concats = paddle.concat([x, y1, y2, y3], axis=1) out = self.conv2(concats) return out class CSPLayer(nn.Layer): """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" def __init__(self, in_channels, out_channels, num_blocks=1, shortcut=True, expansion=0.5, depthwise=False, bias=False, act="silu"): super(CSPLayer, self).__init__() hidden_channels = int(out_channels * expansion) self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.conv2 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.bottlenecks = nn.Sequential(* [ BottleNeck( hidden_channels, hidden_channels, shortcut=shortcut, expansion=1.0, depthwise=depthwise, bias=bias, act=act) for _ in range(num_blocks) ]) self.conv3 = BaseConv( hidden_channels * 2, out_channels, ksize=1, stride=1, bias=bias, act=act) def forward(self, x): x_1 = self.conv1(x) x_1 = self.bottlenecks(x_1) x_2 = self.conv2(x) x = paddle.concat([x_1, x_2], axis=1) x = self.conv3(x) return x @register @serializable class CSPDarkNet(nn.Layer): """ CSPDarkNet backbone. Args: arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. depth_mult (float): Depth multiplier, multiply number of blocks in CSPLayer, default as 1.0. width_mult (float): Width multiplier, multiply number of channels in each layer, default as 1.0. depthwise (bool): Whether to use depth-wise conv layer. act (str): Activation function type, default as 'silu'. return_idx (list): Index of stages whose feature maps are returned. """ __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. arch_settings = { 'X': [[64, 128, 3, True, False], [128, 256, 9, True, False], [256, 512, 9, True, False], [512, 1024, 3, False, True]], 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], [256, 512, 9, True, False], [512, 1024, 3, True, True]], 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], [256, 512, 9, True, False], [512, 768, 3, True, False], [768, 1024, 3, True, True]], } def __init__(self, arch='X', depth_mult=1.0, width_mult=1.0, depthwise=False, act='silu', trt=False, return_idx=[2, 3, 4]): super(CSPDarkNet, self).__init__() self.arch = arch self.return_idx = return_idx Conv = DWConv if depthwise else BaseConv arch_setting = self.arch_settings[arch] base_channels = int(arch_setting[0][0] * width_mult) # Note: differences between the latest YOLOv5 and the original YOLOX # 1. self.stem, use Conv(in YOLOv5) or Focus(in YOLOX) # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer # 4.
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX if arch in ['P5', 'P6']: # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) self.stem = Conv( 3, base_channels, ksize=6, stride=2, bias=False, act=act) spp_kernal_sizes = 5 elif arch in ['X']: # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) self.stem = Focus( 3, base_channels, ksize=3, stride=1, bias=False, act=act) spp_kernal_sizes = (5, 9, 13) else: raise AttributeError("Unsupported arch type: {}".format(arch)) _out_channels = [base_channels] layers_num = 1 self.csp_dark_blocks = [] for i, (in_channels, out_channels, num_blocks, shortcut, use_spp) in enumerate(arch_setting): in_channels = int(in_channels * width_mult) out_channels = int(out_channels * width_mult) _out_channels.append(out_channels) num_blocks = max(round(num_blocks * depth_mult), 1) stage = [] conv_layer = self.add_sublayer( 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), Conv( in_channels, out_channels, 3, 2, bias=False, act=act)) stage.append(conv_layer) layers_num += 1 if use_spp and arch in ['X']: # in YOLOX use SPPLayer spp_layer = self.add_sublayer( 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), SPPLayer( out_channels, out_channels, kernel_sizes=spp_kernal_sizes, bias=False, act=act)) stage.append(spp_layer) layers_num += 1 csp_layer = self.add_sublayer( 'layers{}.stage{}.csp_layer'.format(layers_num, i + 1), CSPLayer( out_channels, out_channels, num_blocks=num_blocks, shortcut=shortcut, depthwise=depthwise, bias=False, act=act)) stage.append(csp_layer) layers_num += 1 if use_spp and arch in ['P5', 'P6']: # in latest YOLOv5 use SPPFLayer instead of SPPLayer sppf_layer = self.add_sublayer( 'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1), SPPFLayer( out_channels, out_channels, ksize=5, bias=False, act=act)) stage.append(sppf_layer) layers_num += 1 self.csp_dark_blocks.append(nn.Sequential(*stage)) self._out_channels = [_out_channels[i] for i in self.return_idx] self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] def forward(self, inputs): x = inputs['image'] outputs = [] x = self.stem(x) for i, layer in enumerate(self.csp_dark_blocks): x = layer(x) if i + 1 in self.return_idx: outputs.append(x) return outputs @property def out_shape(self): return [ ShapeSpec( channels=c, stride=s) for c, s in zip(self._out_channels, self.strides) ] ================================================ FILE: ppdet/modeling/backbones/cspresnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
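# Deployment sketch (illustrative): the RepVggBlock defined below trains with
# parallel 3x3 and 1x1 branches and can be fused into a single 3x3 conv for
# inference via convert_to_deploy(), e.g.
#
#   model = CSPResNet(layers=[3, 6, 6, 3])
#   for layer in model.sublayers():
#       if hasattr(layer, 'convert_to_deploy'):
#           layer.convert_to_deploy()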
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Constant from ppdet.modeling.ops import get_act_fn from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] class ConvBNLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, act=None): super(ConvBNLayer, self).__init__() self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=padding, groups=groups, bias_attr=False) self.bn = nn.BatchNorm2D( ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): x = self.conv(x) x = self.bn(x) x = self.act(x) return x class RepVggBlock(nn.Layer): def __init__(self, ch_in, ch_out, act='relu', alpha=False): super(RepVggBlock, self).__init__() self.ch_in = ch_in self.ch_out = ch_out self.conv1 = ConvBNLayer( ch_in, ch_out, 3, stride=1, padding=1, act=None) self.conv2 = ConvBNLayer( ch_in, ch_out, 1, stride=1, padding=0, act=None) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act if alpha: self.alpha = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=1.)), dtype="float32") else: self.alpha = None def forward(self, x): if hasattr(self, 'conv'): y = self.conv(x) else: if self.alpha: y = self.conv1(x) + self.alpha * self.conv2(x) else: y = self.conv1(x) + self.conv2(x) y = self.act(y) return y def convert_to_deploy(self): if not hasattr(self, 'conv'): self.conv = nn.Conv2D( in_channels=self.ch_in, out_channels=self.ch_out, kernel_size=3, stride=1, padding=1, groups=1) kernel, bias = self.get_equivalent_kernel_bias() self.conv.weight.set_value(kernel) self.conv.bias.set_value(bias) self.__delattr__('conv1') self.__delattr__('conv2') def get_equivalent_kernel_bias(self): kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) if self.alpha: return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( kernel1x1), bias3x3 + self.alpha * bias1x1 else: return kernel3x3 + self._pad_1x1_to_3x3_tensor( kernel1x1), bias3x3 + bias1x1 def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) def _fuse_bn_tensor(self, branch): if branch is None: return 0, 0 kernel = branch.conv.weight running_mean = branch.bn._mean running_var = branch.bn._variance gamma = branch.bn.weight beta = branch.bn.bias eps = branch.bn._epsilon std = (running_var + eps).sqrt() t = (gamma / std).reshape((-1, 1, 1, 1)) return kernel * t, beta - running_mean * gamma / std class BasicBlock(nn.Layer): def __init__(self, ch_in, ch_out, act='relu', shortcut=True, use_alpha=False): super(BasicBlock, self).__init__() assert ch_in == ch_out self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) self.shortcut = shortcut def forward(self, x): y = self.conv1(x) y = self.conv2(y) if self.shortcut: return paddle.add(x, y) else: return y class EffectiveSELayer(nn.Layer): """ Effective Squeeze-Excitation From `CenterMask : 
Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 """ def __init__(self, channels, act='hardsigmoid'): super(EffectiveSELayer, self).__init__() self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): x_se = x.mean((2, 3), keepdim=True) x_se = self.fc(x_se) return x * self.act(x_se) class CSPResStage(nn.Layer): def __init__(self, block_fn, ch_in, ch_out, n, stride, act='relu', attn='eca', use_alpha=False): super(CSPResStage, self).__init__() ch_mid = (ch_in + ch_out) // 2 if stride == 2: self.conv_down = ConvBNLayer( ch_in, ch_mid, 3, stride=2, padding=1, act=act) else: self.conv_down = None self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) self.blocks = nn.Sequential(*[ block_fn( ch_mid // 2, ch_mid // 2, act=act, shortcut=True, use_alpha=use_alpha) for i in range(n) ]) if attn: self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') else: self.attn = None self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) def forward(self, x): if self.conv_down is not None: x = self.conv_down(x) y1 = self.conv1(x) y2 = self.blocks(self.conv2(x)) y = paddle.concat([y1, y2], axis=1) if self.attn is not None: y = self.attn(y) y = self.conv3(y) return y @register @serializable class CSPResNet(nn.Layer): __shared__ = ['width_mult', 'depth_mult', 'trt'] def __init__(self, layers=[3, 6, 6, 3], channels=[64, 128, 256, 512, 1024], act='swish', return_idx=[1, 2, 3], depth_wise=False, use_large_stem=False, width_mult=1.0, depth_mult=1.0, trt=False, use_checkpoint=False, use_alpha=False, **args): super(CSPResNet, self).__init__() self.use_checkpoint = use_checkpoint channels = [max(round(c * width_mult), 1) for c in channels] layers = [max(round(l * depth_mult), 1) for l in layers] act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act if use_large_stem: self.stem = nn.Sequential( ('conv1', ConvBNLayer( 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), ('conv2', ConvBNLayer( channels[0] // 2, channels[0] // 2, 3, stride=1, padding=1, act=act)), ('conv3', ConvBNLayer( channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act))) else: self.stem = nn.Sequential( ('conv1', ConvBNLayer( 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), ('conv2', ConvBNLayer( channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act))) n = len(channels) - 1 self.stages = nn.Sequential(*[(str(i), CSPResStage( BasicBlock, channels[i], channels[i + 1], layers[i], 2, act=act, use_alpha=use_alpha)) for i in range(n)]) self._out_channels = channels[1:] self._out_strides = [4 * 2**i for i in range(n)] self.return_idx = return_idx if use_checkpoint: paddle.seed(0) def forward(self, inputs): x = inputs['image'] x = self.stem(x) outs = [] for idx, stage in enumerate(self.stages): if self.use_checkpoint and self.training: x = paddle.distributed.fleet.utils.recompute( stage, x, **{"preserve_rng_state": True}) else: x = stage(x) if idx in self.return_idx: outs.append(x) return outs @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] ================================================ FILE: ppdet/modeling/backbones/darknet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.ops import batch_norm, mish from ..shape_spec import ShapeSpec __all__ = ['DarkNet', 'ConvBNLayer'] class ConvBNLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, norm_type='bn', norm_decay=0., act="leaky", freeze_norm=False, data_format='NCHW', name=''): """ conv + bn + activation layer Args: ch_in (int): input channel ch_out (int): output channel filter_size (int): filter size, default 3 stride (int): stride, default 1 groups (int): number of groups of conv layer, default 1 padding (int): padding size, default 0 norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. act (str): activation function type, default 'leaky', which means leaky_relu freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(ConvBNLayer, self).__init__() self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=padding, groups=groups, data_format=data_format, bias_attr=False) self.batch_norm = batch_norm( ch_out, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.act = act def forward(self, inputs): out = self.conv(inputs) out = self.batch_norm(out) if self.act == 'leaky': out = F.leaky_relu(out, 0.1) else: out = getattr(F, self.act)(out) return out class DownSample(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=2, padding=1, norm_type='bn', norm_decay=0., freeze_norm=False, data_format='NCHW'): """ downsample layer Args: ch_in (int): input channel ch_out (int): output channel filter_size (int): filter size, default 3 stride (int): stride, default 2 padding (int): padding size, default 1 norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(DownSample, self).__init__() self.conv_bn_layer = ConvBNLayer( ch_in=ch_in, ch_out=ch_out, filter_size=filter_size, stride=stride, padding=padding, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.ch_out = ch_out def forward(self, inputs): out = self.conv_bn_layer(inputs) return out class BasicBlock(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn', norm_decay=0., freeze_norm=False, data_format='NCHW'): """ BasicBlock layer of DarkNet Args: ch_in (int): input channel ch_out (int): output channel norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(BasicBlock, self).__init__() assert ch_in == ch_out and (ch_in % 2) == 0, \ f"ch_in and ch_out should be the same even integer, but got ch_in={ch_in}, ch_out={ch_out}" # example: # --------------{conv1} --> {conv2} # channel route: 10-->5 --> 5-->10 self.conv1 = ConvBNLayer( ch_in=ch_in, ch_out=int(ch_out / 2), filter_size=1, stride=1, padding=0, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.conv2 = ConvBNLayer( ch_in=int(ch_out / 2), ch_out=ch_out, filter_size=3, stride=1, padding=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) def forward(self, inputs): conv1 = self.conv1(inputs) conv2 = self.conv2(conv1) out = paddle.add(x=inputs, y=conv2) return out class Blocks(nn.Layer): def __init__(self, ch_in, ch_out, count, norm_type='bn', norm_decay=0., freeze_norm=False, name=None, data_format='NCHW'): """ Blocks layer, which consists of several BasicBlock layers Args: ch_in (int): input channel ch_out (int): output channel count (int): number of BasicBlock layers norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. freeze_norm (bool): whether to freeze norm, default False name (str): layer name data_format (str): data format, NCHW or NHWC """ super(Blocks, self).__init__() self.basicblock0 = BasicBlock( ch_in, ch_out, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.res_out_list = [] for i in range(1, count): block_name = '{}.{}'.format(name, i) res_out = self.add_sublayer( block_name, BasicBlock( ch_out, ch_out, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format)) self.res_out_list.append(res_out) self.ch_out = ch_out def forward(self, inputs): y = self.basicblock0(inputs) for basic_block_i in self.res_out_list: y = basic_block_i(y) return y DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} @register @serializable class DarkNet(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, depth=53, freeze_at=-1, return_idx=[2, 3, 4], num_stages=5, norm_type='bn', norm_decay=0., freeze_norm=False, data_format='NCHW'): """ Darknet, see https://pjreddie.com/darknet/yolo/ Args: depth (int): depth of network freeze_at (int): freeze the backbone at which stage num_stages (int): number of stages to build, default 5 return_idx (list): index of stages whose feature maps are returned norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0.
data_format (str): data format, NCHW or NHWC """ super(DarkNet, self).__init__() self.depth = depth self.freeze_at = freeze_at self.return_idx = return_idx self.num_stages = num_stages self.stages = DarkNet_cfg[self.depth][0:num_stages] self.conv0 = ConvBNLayer( ch_in=3, ch_out=32, filter_size=3, stride=1, padding=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.downsample0 = DownSample( ch_in=32, ch_out=32 * 2, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self._out_channels = [] self.darknet_conv_block_list = [] self.downsample_list = [] ch_in = [64, 128, 256, 512, 1024] for i, stage in enumerate(self.stages): name = 'stage.{}'.format(i) conv_block = self.add_sublayer( name, Blocks( int(ch_in[i]), int(ch_in[i]), stage, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.darknet_conv_block_list.append(conv_block) if i in return_idx: self._out_channels.append(int(ch_in[i])) for i in range(num_stages - 1): down_name = 'stage.{}.downsample'.format(i) downsample = self.add_sublayer( down_name, DownSample( ch_in=int(ch_in[i]), ch_out=int(ch_in[i + 1]), norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format)) self.downsample_list.append(downsample) def forward(self, inputs): x = inputs['image'] out = self.conv0(x) out = self.downsample0(out) blocks = [] for i, conv_block_i in enumerate(self.darknet_conv_block_list): out = conv_block_i(out) if i == self.freeze_at: out.stop_gradient = True if i in self.return_idx: blocks.append(out) if i < self.num_stages - 1: out = self.downsample_list[i](out) return blocks @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/dla.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
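DarkNet-53 above stacks these Blocks with a DownSample between stages, halving resolution and doubling channels each time; with the default return_idx=[2, 3, 4] it emits C3/C4/C5 features at strides 8, 16 and 32 for a YOLOv3-style neck. A quick shape-check sketch, under the same PaddlePaddle/PYTHONPATH assumptions as above:

import paddle
from ppdet.modeling.backbones.darknet import DarkNet

model = DarkNet(depth=53, return_idx=[2, 3, 4])
feats = model({'image': paddle.randn([1, 3, 416, 416])})
print([f.shape for f in feats])
# [1, 256, 52, 52], [1, 512, 26, 26], [1, 1024, 13, 13]
print([spec.channels for spec in model.out_shape])  # [256, 512, 1024]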
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ..shape_spec import ShapeSpec DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), } class BasicBlock(nn.Layer): def __init__(self, ch_in, ch_out, stride=1): super(BasicBlock, self).__init__() self.conv1 = ConvNormLayer( ch_in, ch_out, filter_size=3, stride=stride, bias_on=False, norm_decay=None) self.conv2 = ConvNormLayer( ch_out, ch_out, filter_size=3, stride=1, bias_on=False, norm_decay=None) def forward(self, inputs, residual=None): if residual is None: residual = inputs out = self.conv1(inputs) out = F.relu(out) out = self.conv2(out) out = paddle.add(x=out, y=residual) out = F.relu(out) return out class Root(nn.Layer): def __init__(self, ch_in, ch_out, kernel_size, residual): super(Root, self).__init__() self.conv = ConvNormLayer( ch_in, ch_out, filter_size=1, stride=1, bias_on=False, norm_decay=None) self.residual = residual def forward(self, inputs): children = inputs out = self.conv(paddle.concat(inputs, axis=1)) if self.residual: out = paddle.add(x=out, y=children[0]) out = F.relu(out) return out class Tree(nn.Layer): def __init__(self, level, block, ch_in, ch_out, stride=1, level_root=False, root_dim=0, root_kernel_size=1, root_residual=False): super(Tree, self).__init__() if root_dim == 0: root_dim = 2 * ch_out if level_root: root_dim += ch_in if level == 1: self.tree1 = block(ch_in, ch_out, stride) self.tree2 = block(ch_out, ch_out, 1) else: self.tree1 = Tree( level - 1, block, ch_in, ch_out, stride, root_dim=0, root_kernel_size=root_kernel_size, root_residual=root_residual) self.tree2 = Tree( level - 1, block, ch_out, ch_out, 1, root_dim=root_dim + ch_out, root_kernel_size=root_kernel_size, root_residual=root_residual) if level == 1: self.root = Root(root_dim, ch_out, root_kernel_size, root_residual) self.level_root = level_root self.root_dim = root_dim self.downsample = None self.project = None self.level = level if stride > 1: self.downsample = nn.MaxPool2D(stride, stride=stride) if ch_in != ch_out: self.project = ConvNormLayer( ch_in, ch_out, filter_size=1, stride=1, bias_on=False, norm_decay=None) def forward(self, x, residual=None, children=None): children = [] if children is None else children bottom = self.downsample(x) if self.downsample else x residual = self.project(bottom) if self.project else bottom if self.level_root: children.append(bottom) x1 = self.tree1(x, residual) if self.level == 1: x2 = self.tree2(x1) x = self.root([x2, x1] + children) else: children.append(x1) x = self.tree2(x1, children=children) return x @register @serializable class DLA(nn.Layer): """ DLA, see https://arxiv.org/pdf/1707.06484.pdf Args: depth (int): DLA depth, only 34 is supported now. residual_root (bool): whether to use a residual layer in the root block pre_img (bool): add pre_img, only used in CenterTrack pre_hm (bool): add pre_hm, only used in CenterTrack """ def __init__(self, depth=34, residual_root=False, pre_img=False, pre_hm=False): super(DLA, self).__init__() assert depth == 34, 'Only DLA with depth 34 is supported now.'
if depth == 34: block = BasicBlock levels, channels = DLA_cfg[depth] self.channels = channels self.num_levels = len(levels) self.base_layer = nn.Sequential( ConvNormLayer( 3, channels[0], filter_size=7, stride=1, bias_on=False, norm_decay=None), nn.ReLU()) self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) self.level1 = self._make_conv_level( channels[0], channels[1], levels[1], stride=2) self.level2 = Tree( levels[2], block, channels[1], channels[2], 2, level_root=False, root_residual=residual_root) self.level3 = Tree( levels[3], block, channels[2], channels[3], 2, level_root=True, root_residual=residual_root) self.level4 = Tree( levels[4], block, channels[3], channels[4], 2, level_root=True, root_residual=residual_root) self.level5 = Tree( levels[5], block, channels[4], channels[5], 2, level_root=True, root_residual=residual_root) if pre_img: self.pre_img_layer = nn.Sequential( ConvNormLayer( 3, channels[0], filter_size=7, stride=1, bias_on=False, norm_decay=None), nn.ReLU()) if pre_hm: self.pre_hm_layer = nn.Sequential( ConvNormLayer( 1, channels[0], filter_size=7, stride=1, bias_on=False, norm_decay=None), nn.ReLU()) self.pre_img = pre_img self.pre_hm = pre_hm def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1): modules = [] for i in range(conv_num): modules.extend([ ConvNormLayer( ch_in, ch_out, filter_size=3, stride=stride if i == 0 else 1, bias_on=False, norm_decay=None), nn.ReLU() ]) ch_in = ch_out return nn.Sequential(*modules) @property def out_shape(self): return [ ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels) ] def forward(self, inputs): outs = [] feats = self.base_layer(inputs['image']) if self.pre_img and 'pre_image' in inputs and inputs[ 'pre_image'] is not None: feats = feats + self.pre_img_layer(inputs['pre_image']) if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None: feats = feats + self.pre_hm_layer(inputs['pre_hm']) for i in range(self.num_levels): feats = getattr(self, 'level{}'.format(i))(feats) outs.append(feats) return outs ================================================ FILE: ppdet/modeling/backbones/esnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
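The Tree modules above implement DLA-34's hierarchical aggregation: each stage is a small binary tree of BasicBlocks whose outputs are merged by a Root node (channel concat followed by a 1x1 ConvNormLayer), and forward() returns all six levels so CenterNet-style necks can aggregate them. A shape sketch under the same import assumptions as the earlier examples:

import paddle
from ppdet.modeling.backbones.dla import DLA

model = DLA(depth=34)
outs = model({'image': paddle.randn([1, 3, 512, 512])})
print([o.shape for o in outs])
# six levels: channels [16, 32, 64, 128, 256, 512] at strides [1, 2, 4, 8, 16, 32]
print([spec.channels for spec in model.out_shape])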
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm from paddle.nn.initializer import KaimingNormal from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec from ppdet.modeling.ops import channel_shuffle from ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer __all__ = ['ESNet'] def make_divisible(v, divisor=16, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class SEModule(nn.Layer): def __init__(self, channel, reduction=4): super(SEModule, self).__init__() self.avg_pool = AdaptiveAvgPool2D(1) self.conv1 = Conv2D( in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(), bias_attr=ParamAttr()) self.conv2 = Conv2D( in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(), bias_attr=ParamAttr()) def forward(self, inputs): outputs = self.avg_pool(inputs) outputs = self.conv1(outputs) outputs = F.relu(outputs) outputs = self.conv2(outputs) outputs = F.hardsigmoid(outputs) return paddle.multiply(x=inputs, y=outputs) class InvertedResidual(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, stride, act="relu"): super(InvertedResidual, self).__init__() self._conv_pw = ConvBNLayer( in_channels=in_channels // 2, out_channels=mid_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw = ConvBNLayer( in_channels=mid_channels // 2, out_channels=mid_channels // 2, kernel_size=3, stride=stride, padding=1, groups=mid_channels // 2, act=None) self._se = SEModule(mid_channels) self._conv_linear = ConvBNLayer( in_channels=mid_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) def forward(self, inputs): x1, x2 = paddle.split( inputs, num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], axis=1) x2 = self._conv_pw(x2) x3 = self._conv_dw(x2) x3 = paddle.concat([x2, x3], axis=1) x3 = self._se(x3) x3 = self._conv_linear(x3) out = paddle.concat([x1, x3], axis=1) return channel_shuffle(out, 2) class InvertedResidualDS(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, stride, act="relu"): super(InvertedResidualDS, self).__init__() # branch1 self._conv_dw_1 = ConvBNLayer( in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, act=None) self._conv_linear_1 = ConvBNLayer( in_channels=in_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) # branch2 self._conv_pw_2 = ConvBNLayer( in_channels=in_channels, out_channels=mid_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw_2 = ConvBNLayer( in_channels=mid_channels // 2, out_channels=mid_channels // 2, kernel_size=3, stride=stride, padding=1, groups=mid_channels // 2, act=None) self._se = SEModule(mid_channels // 2) self._conv_linear_2 = ConvBNLayer( in_channels=mid_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw_mv1 = ConvBNLayer( in_channels=out_channels, out_channels=out_channels, 
kernel_size=3, stride=1, padding=1, groups=out_channels, act="hard_swish") self._conv_pw_mv1 = ConvBNLayer( in_channels=out_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, groups=1, act="hard_swish") def forward(self, inputs): x1 = self._conv_dw_1(inputs) x1 = self._conv_linear_1(x1) x2 = self._conv_pw_2(inputs) x2 = self._conv_dw_2(x2) x2 = self._se(x2) x2 = self._conv_linear_2(x2) out = paddle.concat([x1, x2], axis=1) out = self._conv_dw_mv1(out) out = self._conv_pw_mv1(out) return out @register @serializable class ESNet(nn.Layer): def __init__(self, scale=1.0, act="hard_swish", feature_maps=[4, 11, 14], channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]): super(ESNet, self).__init__() self.scale = scale if isinstance(feature_maps, Integral): feature_maps = [feature_maps] self.feature_maps = feature_maps stage_repeats = [3, 7, 3] stage_out_channels = [ -1, 24, make_divisible(128 * scale), make_divisible(256 * scale), make_divisible(512 * scale), 1024 ] self._out_channels = [] self._feature_idx = 0 # 1. conv1 self._conv1 = ConvBNLayer( in_channels=3, out_channels=stage_out_channels[1], kernel_size=3, stride=2, padding=1, act=act) self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) self._feature_idx += 1 # 2. bottleneck sequences self._block_list = [] arch_idx = 0 for stage_id, num_repeat in enumerate(stage_repeats): for i in range(num_repeat): channels_scales = channel_ratio[arch_idx] mid_c = make_divisible( int(stage_out_channels[stage_id + 2] * channels_scales), divisor=8) if i == 0: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidualDS( in_channels=stage_out_channels[stage_id + 1], mid_channels=mid_c, out_channels=stage_out_channels[stage_id + 2], stride=2, act=act)) else: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidual( in_channels=stage_out_channels[stage_id + 2], mid_channels=mid_c, out_channels=stage_out_channels[stage_id + 2], stride=1, act=act)) self._block_list.append(block) arch_idx += 1 self._feature_idx += 1 self._update_out_channels(stage_out_channels[stage_id + 2], self._feature_idx, self.feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): y = self._conv1(inputs['image']) y = self._max_pool(y) outs = [] for i, inv in enumerate(self._block_list): y = inv(y) if i + 2 in self.feature_maps: outs.append(y) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/focalnet.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
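ESNet above derives every stage width from make_divisible, which rounds a scaled channel count to the nearest multiple of divisor and bumps it up one step whenever plain rounding would lose more than 10% of the requested width. A quick illustration with hypothetical channel counts, assuming the repo is importable:

from ppdet.modeling.backbones.esnet import make_divisible

print(make_divisible(100))            # 96: nearest multiple of the default divisor 16
print(make_divisible(20))             # 32: rounding down to 16 would drop >10%, so bump up
print(make_divisible(50, divisor=8))  # 48: ESNet's bottleneck mid channels use divisor=8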
""" This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py """ import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import DropPath, Identity from .transformer_utils import add_parameter, to_2tuple from .transformer_utils import ones_, zeros_, trunc_normal_ from .swin_transformer import Mlp __all__ = ['FocalNet'] MODEL_cfg = { 'focalnet_T_224_1k_srf': dict( embed_dim=96, depths=[2, 2, 6, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], drop_path_rate=0.2, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams', ), 'focalnet_S_224_1k_srf': dict( embed_dim=96, depths=[2, 2, 18, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], drop_path_rate=0.3, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams', ), 'focalnet_B_224_1k_srf': dict( embed_dim=128, depths=[2, 2, 18, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams', ), 'focalnet_T_224_1k_lrf': dict( embed_dim=96, depths=[2, 2, 6, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.2, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams', ), 'focalnet_S_224_1k_lrf': dict( embed_dim=96, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.3, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams', ), 'focalnet_B_224_1k_lrf': dict( embed_dim=128, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams', ), 'focalnet_L_384_22k_fl3': dict( embed_dim=192, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[5, 5, 5, 5], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams', ), 'focalnet_L_384_22k_fl4': dict( embed_dim=192, depths=[2, 2, 18, 2], focal_levels=[4, 4, 4, 4], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=True, # pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams', ), 'focalnet_XL_384_22k_fl3': dict( 
embed_dim=256, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[5, 5, 5, 5], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams', ), 'focalnet_XL_384_22k_fl4': dict( embed_dim=256, depths=[2, 2, 18, 2], focal_levels=[4, 4, 4, 4], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams', ), 'focalnet_H_224_22k_fl3': dict( embed_dim=352, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=True, # use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams', ), 'focalnet_H_224_22k_fl4': dict( embed_dim=352, depths=[2, 2, 18, 2], focal_levels=[4, 4, 4, 4], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=True, # use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams', ), } class FocalModulation(nn.Layer): """ Args: dim (int): Number of input channels. proj_drop (float, optional): Dropout ratio of output. Default: 0.0 focal_level (int): Number of focal levels focal_window (int): Focal window size at focal level 1 focal_factor (int): Step to increase the focal window. Default: 2 use_postln_in_modulation (bool): Whether use post-modulation layernorm normalize_modulator (bool): Whether use normalize in modulator """ def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, focal_factor=2, use_postln_in_modulation=False, normalize_modulator=False): super().__init__() self.dim = dim # specific args for focalv3 self.focal_level = focal_level self.focal_window = focal_window self.focal_factor = focal_factor self.use_postln_in_modulation = use_postln_in_modulation self.normalize_modulator = normalize_modulator self.f = nn.Linear( dim, 2 * dim + (self.focal_level + 1), bias_attr=True) self.h = nn.Conv2D( dim, dim, kernel_size=1, stride=1, padding=0, groups=1, bias_attr=True) self.act = nn.GELU() self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.focal_layers = nn.LayerList() if self.use_postln_in_modulation: self.ln = nn.LayerNorm(dim) for k in range(self.focal_level): kernel_size = self.focal_factor * k + self.focal_window self.focal_layers.append( nn.Sequential( nn.Conv2D( dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias_attr=False), nn.GELU())) def forward(self, x): """ Forward function. 
Args: x: input features with shape of (B, H, W, C) """ _, _, _, C = x.shape x = self.f(x) x = x.transpose([0, 3, 1, 2]) q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1) ctx_all = 0 for l in range(self.focal_level): ctx = self.focal_layers[l](ctx) ctx_all = ctx_all + ctx * gates[:, l:l + 1] ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:] if self.normalize_modulator: ctx_all = ctx_all / (self.focal_level + 1) x_out = q * self.h(ctx_all) x_out = x_out.transpose([0, 2, 3, 1]) if self.use_postln_in_modulation: x_out = self.ln(x_out) x_out = self.proj(x_out) x_out = self.proj_drop(x_out) return x_out class FocalModulationBlock(nn.Layer): """ Focal Modulation Block. Args: dim (int): Number of input channels. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. drop (float, optional): Dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm focal_level (int): number of focal levels focal_window (int): focal kernel size at level 1 use_postln (bool): Whether use layernorm after modulation. Default: False. use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. normalize_modulator (bool): Whether use normalize in modulator use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False layerscale_value (float): Value for layer scale. Default: 1e-4 """ def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, focal_level=2, focal_window=9, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False, use_layerscale=False, layerscale_value=1e-4): super().__init__() self.dim = dim self.mlp_ratio = mlp_ratio self.focal_window = focal_window self.focal_level = focal_level self.use_postln = use_postln self.use_layerscale = use_layerscale self.norm1 = norm_layer(dim) self.modulation = FocalModulation( dim, proj_drop=drop, focal_level=self.focal_level, focal_window=self.focal_window, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator) self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) self.H = None self.W = None self.gamma_1 = 1.0 self.gamma_2 = 1.0 if self.use_layerscale: self.gamma_1 = add_parameter(self, layerscale_value * paddle.ones([dim])) self.gamma_2 = add_parameter(self, layerscale_value * paddle.ones([dim])) def forward(self, x): """ Args: x: Input feature, tensor size (B, H*W, C). """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x if not self.use_postln: x = self.norm1(x) x = x.reshape([-1, H, W, C]) # FM x = self.modulation(x).reshape([-1, H * W, C]) if self.use_postln: x = self.norm1(x) # FFN x = shortcut + self.drop_path(self.gamma_1 * x) if self.use_postln: x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) else: x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x class BasicLayer(nn.Layer): """ A basic focal modulation layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. 
drop (float, optional): Dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None focal_level (int): Number of focal levels focal_window (int): Focal window size at focal level 1 use_conv_embed (bool): Whether use overlapped convolution for patch embedding use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False layerscale_value (float): Value of layerscale use_postln (bool): Whether use layernorm after modulation. Default: False. use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. normalize_modulator (bool): Whether use normalize in modulator use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__(self, dim, depth, mlp_ratio=4., drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None, focal_level=2, focal_window=9, use_conv_embed=False, use_layerscale=False, layerscale_value=1e-4, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False, use_checkpoint=False): super().__init__() self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.LayerList([ FocalModulationBlock( dim=dim, mlp_ratio=mlp_ratio, drop=drop, drop_path=drop_path[i] if isinstance(drop_path, np.ndarray) else drop_path, act_layer=nn.GELU, norm_layer=norm_layer, focal_level=focal_level, focal_window=focal_window, use_postln=use_postln, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator, use_layerscale=use_layerscale, layerscale_value=layerscale_value) for i in range(depth) ]) # patch merging layer if downsample is not None: self.downsample = downsample( patch_size=2, in_chans=dim, embed_dim=2 * dim, use_conv_embed=use_conv_embed, norm_layer=norm_layer, is_stem=False) else: self.downsample = None def forward(self, x, H, W): """ Args: x: Input feature, tensor size (B, H*W, C). """ for blk in self.blocks: blk.H, blk.W = H, W x = blk(x) if self.downsample is not None: x_reshaped = x.transpose([0, 2, 1]).reshape( [x.shape[0], x.shape[-1], H, W]) x_down = self.downsample(x_reshaped) x_down = x_down.flatten(2).transpose([0, 2, 1]) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Layer): """ Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Layer, optional): Normalization layer. Default: None use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False is_stem (bool): Is the stem block or not. 
""" def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None, use_conv_embed=False, is_stem=False): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim if use_conv_embed: # if we choose to use conv embedding, then we treat the stem and non-stem differently if is_stem: kernel_size = 7 padding = 2 stride = 4 else: kernel_size = 3 padding = 1 stride = 2 self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) else: self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): _, _, H, W = x.shape if W % self.patch_size[1] != 0: # for 3D tensor: [pad_left, pad_right] # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom] x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) W += W % self.patch_size[1] if H % self.patch_size[0] != 0: x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) H += H % self.patch_size[0] x = self.proj(x) if self.norm is not None: _, _, Wh, Ww = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = self.norm(x) x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) return x @register @serializable class FocalNet(nn.Layer): """ FocalNet backbone Args: arch (str): Architecture of FocalNet out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each FocalNet Transformer stage. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. drop_rate (float): Dropout rate. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. patch_norm (bool): If True, add normalization after patch embedding. Default: True. focal_levels (Sequence[int]): Number of focal levels at four stages focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages use_conv_embed (bool): Whether use overlapped convolution for patch embedding use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False layerscale_value (float): Value of layerscale use_postln (bool): Whether use layernorm after modulation. Default: False. use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. normalize_modulator (bool): Whether use normalize in modulator use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
""" def __init__( self, arch='focalnet_T_224_1k_srf', out_indices=(0, 1, 2, 3), frozen_stages=-1, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], mlp_ratio=4., drop_rate=0., drop_path_rate=0.2, # 0.5 better for large+ models norm_layer=nn.LayerNorm, patch_norm=True, focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], use_conv_embed=False, use_layerscale=False, layerscale_value=1e-4, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False, use_checkpoint=False, pretrained=None): super(FocalNet, self).__init__() assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) embed_dim = MODEL_cfg[arch]['embed_dim'] depths = MODEL_cfg[arch]['depths'] drop_path_rate = MODEL_cfg[arch]['drop_path_rate'] focal_levels = MODEL_cfg[arch]['focal_levels'] focal_windows = MODEL_cfg[arch]['focal_windows'] use_conv_embed = MODEL_cfg[arch]['use_conv_embed'] use_layerscale = MODEL_cfg[arch]['use_layerscale'] use_postln = MODEL_cfg[arch]['use_postln'] use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation'] normalize_modulator = MODEL_cfg[arch]['normalize_modulator'] if pretrained is None: pretrained = MODEL_cfg[arch]['pretrained'] self.out_indices = out_indices self.frozen_stages = frozen_stages self.num_layers = len(depths) self.patch_norm = patch_norm # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, use_conv_embed=use_conv_embed, is_stem=True) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth decay rule dpr = np.linspace(0, drop_path_rate, sum(depths)) # build layers self.layers = nn.LayerList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2**i_layer), depth=depths[i_layer], mlp_ratio=mlp_ratio, drop=drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchEmbed if (i_layer < self.num_layers - 1) else None, focal_level=focal_levels[i_layer], focal_window=focal_windows[i_layer], use_conv_embed=use_conv_embed, use_layerscale=use_layerscale, layerscale_value=layerscale_value, use_postln=use_postln, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator, use_checkpoint=use_checkpoint) self.layers.append(layer) num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f'norm{i_layer}' self.add_sublayer(layer_name, layer) self.apply(self._init_weights) self._freeze_stages() if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained self.set_state_dict(paddle.load(path)) def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.stop_gradient = True def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight) if isinstance(m, nn.Linear) and m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.LayerNorm): zeros_(m.bias) ones_(m.weight) def forward(self, x): x = self.patch_embed(x['image']) B, _, Wh, Ww = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = 
self.pos_drop(x) outs = [] for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') x_out = norm_layer(x_out) out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose( (0, 3, 1, 2)) outs.append(out) return outs @property def out_shape(self): out_strides = [4, 8, 16, 32] return [ ShapeSpec( channels=self.num_features[i], stride=out_strides[i]) for i in self.out_indices ] ================================================ FILE: ppdet/modeling/backbones/ghostnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import AdaptiveAvgPool2D, Linear from paddle.nn.initializer import Uniform from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec from .mobilenet_v3 import make_divisible, ConvBNLayer __all__ = ['GhostNet'] class ExtraBlockDW(nn.Layer): def __init__(self, in_c, ch_1, ch_2, stride, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=None): super(ExtraBlockDW, self).__init__() self.pointwise_conv = ConvBNLayer( in_c=in_c, out_c=ch_1, filter_size=1, stride=1, padding=0, act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra1") self.depthwise_conv = ConvBNLayer( in_c=ch_1, out_c=ch_2, filter_size=3, stride=stride, padding=1, # num_groups=int(ch_1), act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_dw") self.normal_conv = ConvBNLayer( in_c=ch_2, out_c=ch_2, filter_size=1, stride=1, padding=0, act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_sep") def forward(self, inputs): x = self.pointwise_conv(inputs) x = self.depthwise_conv(x) x = self.normal_conv(x) return x class SEBlock(nn.Layer): def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None): super(SEBlock, self).__init__() self.pool2d_gap = AdaptiveAvgPool2D(1) self._num_channels = num_channels stdv = 1.0 / math.sqrt(num_channels * 1.0) med_ch = num_channels // reduction_ratio self.squeeze = Linear( num_channels, med_ch, weight_attr=ParamAttr( learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), bias_attr=ParamAttr(learning_rate=lr_mult)) stdv = 1.0 / math.sqrt(med_ch * 1.0) self.excitation = Linear( med_ch, num_channels, weight_attr=ParamAttr( learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), bias_attr=ParamAttr(learning_rate=lr_mult)) def forward(self, inputs): pool = self.pool2d_gap(inputs) pool = paddle.squeeze(pool, axis=[2, 3]) squeeze = self.squeeze(pool) squeeze = F.relu(squeeze) excitation = 
self.excitation(squeeze) excitation = paddle.clip(x=excitation, min=0, max=1) excitation = paddle.unsqueeze(excitation, axis=[2, 3]) out = paddle.multiply(inputs, excitation) return out class GhostModule(nn.Layer): def __init__(self, in_channels, output_channels, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True, lr_mult=1., conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=None): super(GhostModule, self).__init__() init_channels = int(math.ceil(output_channels / ratio)) new_channels = int(init_channels * (ratio - 1)) self.primary_conv = ConvBNLayer( in_c=in_channels, out_c=init_channels, filter_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2), num_groups=1, act="relu" if relu else None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_primary_conv") self.cheap_operation = ConvBNLayer( in_c=init_channels, out_c=new_channels, filter_size=dw_size, stride=1, padding=int((dw_size - 1) // 2), num_groups=init_channels, act="relu" if relu else None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_cheap_operation") def forward(self, inputs): x = self.primary_conv(inputs) y = self.cheap_operation(x) out = paddle.concat([x, y], axis=1) return out class GhostBottleneck(nn.Layer): def __init__(self, in_channels, hidden_dim, output_channels, kernel_size, stride, use_se, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, return_list=False, name=None): super(GhostBottleneck, self).__init__() self._stride = stride self._use_se = use_se self._num_channels = in_channels self._output_channels = output_channels self.return_list = return_list self.ghost_module_1 = GhostModule( in_channels=in_channels, output_channels=hidden_dim, kernel_size=1, stride=1, relu=True, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_ghost_module_1") if stride == 2: self.depthwise_conv = ConvBNLayer( in_c=hidden_dim, out_c=hidden_dim, filter_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2), num_groups=hidden_dim, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. ) if use_se: self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se") self.ghost_module_2 = GhostModule( in_channels=hidden_dim, output_channels=output_channels, kernel_size=1, relu=False, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_ghost_module_2") if stride != 1 or in_channels != output_channels: self.shortcut_depthwise = ConvBNLayer( in_c=in_channels, out_c=in_channels, filter_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2), num_groups=in_channels, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. 
) self.shortcut_conv = ConvBNLayer( in_c=in_channels, out_c=output_channels, filter_size=1, stride=1, padding=0, num_groups=1, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_shortcut_conv") def forward(self, inputs): y = self.ghost_module_1(inputs) x = y if self._stride == 2: x = self.depthwise_conv(x) if self._use_se: x = self.se_block(x) x = self.ghost_module_2(x) if self._stride == 1 and self._num_channels == self._output_channels: shortcut = inputs else: shortcut = self.shortcut_depthwise(inputs) shortcut = self.shortcut_conv(shortcut) x = paddle.add(x=x, y=shortcut) if self.return_list: return [y, x] else: return x @register @serializable class GhostNet(nn.Layer): __shared__ = ['norm_type'] def __init__( self, scale=1.3, feature_maps=[6, 12, 15], with_extra_blocks=False, extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], conv_decay=0., norm_type='bn', norm_decay=0.0, freeze_norm=False): super(GhostNet, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] if norm_type == 'sync_bn' and freeze_norm: raise ValueError( "The norm_type should not be sync_bn when freeze_norm is True") self.feature_maps = feature_maps self.with_extra_blocks = with_extra_blocks self.extra_block_filters = extra_block_filters inplanes = 16 self.cfgs = [ # k, t, c, SE, s [3, 16, 16, 0, 1], [3, 48, 24, 0, 2], [3, 72, 24, 0, 1], [5, 72, 40, 1, 2], [5, 120, 40, 1, 1], [3, 240, 80, 0, 2], [3, 200, 80, 0, 1], [3, 184, 80, 0, 1], [3, 184, 80, 0, 1], [3, 480, 112, 1, 1], [3, 672, 112, 1, 1], [5, 672, 160, 1, 2], # SSDLite output [5, 960, 160, 0, 1], [5, 960, 160, 1, 1], [5, 960, 160, 0, 1], [5, 960, 160, 1, 1] ] self.scale = scale conv1_out_ch = int(make_divisible(inplanes * self.scale, 4)) self.conv1 = ConvBNLayer( in_c=3, out_c=conv1_out_ch, filter_size=3, stride=2, padding=1, num_groups=1, act="relu", lr_mult=1., conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv1") # build inverted residual blocks self._out_channels = [] self.ghost_bottleneck_list = [] idx = 0 inplanes = conv1_out_ch for k, exp_size, c, use_se, s in self.cfgs: lr_idx = min(idx // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] # for SSD/SSDLite, first head input is after ResidualUnit expand_conv return_list = self.with_extra_blocks and idx + 2 in self.feature_maps ghost_bottleneck = self.add_sublayer( "_ghostbottleneck_" + str(idx), sublayer=GhostBottleneck( in_channels=inplanes, hidden_dim=int(make_divisible(exp_size * self.scale, 4)), output_channels=int(make_divisible(c * self.scale, 4)), kernel_size=k, stride=s, use_se=use_se, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, return_list=return_list, name="_ghostbottleneck_" + str(idx))) self.ghost_bottleneck_list.append(ghost_bottleneck) inplanes = int(make_divisible(c * self.scale, 4)) idx += 1 self._update_out_channels( int(make_divisible(exp_size * self.scale, 4)) if return_list else inplanes, idx + 1, feature_maps) if self.with_extra_blocks: self.extra_block_list = [] extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4)) lr_idx = min(idx // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] conv_extra = self.add_sublayer( "conv" + str(idx + 2), sublayer=ConvBNLayer( in_c=inplanes, out_c=extra_out_c, filter_size=1, stride=1, padding=0, num_groups=1, act="relu6", lr_mult=lr_mult, 
conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv" + str(idx + 2))) self.extra_block_list.append(conv_extra) idx += 1 self._update_out_channels(extra_out_c, idx + 1, feature_maps) for j, block_filter in enumerate(self.extra_block_filters): in_c = extra_out_c if j == 0 else self.extra_block_filters[j - 1][1] conv_extra = self.add_sublayer( "conv" + str(idx + 2), sublayer=ExtraBlockDW( in_c, block_filter[0], block_filter[1], stride=2, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name='conv' + str(idx + 2))) self.extra_block_list.append(conv_extra) idx += 1 self._update_out_channels(block_filter[1], idx + 1, feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): x = self.conv1(inputs['image']) outs = [] for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): x = ghost_bottleneck(x) if idx + 2 in self.feature_maps: if isinstance(x, list): outs.append(x[0]) x = x[1] else: outs.append(x) if not self.with_extra_blocks: return outs for i, block in enumerate(self.extra_block_list): idx = i + len(self.ghost_bottleneck_list) x = block(x) if idx + 2 in self.feature_maps: outs.append(x) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/hardnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
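The GhostModule in ghostnet.py above implements the core GhostNet idea: spend a full convolution on only about half of the output channels (primary_conv) and synthesize the remaining "ghost" channels with a cheap depthwise convolution (cheap_operation), then concatenate both halves. A minimal shape sketch — the sizes and the 'demo' name are illustrative, assuming the repo is importable:

import paddle
from ppdet.modeling.backbones.ghostnet import GhostModule

gm = GhostModule(in_channels=16, output_channels=32, name='demo')
y = gm(paddle.randn([1, 16, 56, 56]))
print(y.shape)  # [1, 32, 56, 56]: 16 primary channels + 16 depthwise "ghost" channels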
import paddle import paddle.nn as nn from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['HarDNet'] def ConvLayer(in_channels, out_channels, kernel_size=3, stride=1, bias_attr=False): layer = nn.Sequential( ('conv', nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=1, bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), ('relu', nn.ReLU6())) return layer def DWConvLayer(in_channels, out_channels, kernel_size=3, stride=1, bias_attr=False): layer = nn.Sequential( ('dwconv', nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=1, groups=out_channels, bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) return layer def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): layer = nn.Sequential( ('layer1', ConvLayer( in_channels, out_channels, kernel_size=kernel_size)), ('layer2', DWConvLayer( out_channels, out_channels, stride=stride))) return layer class HarDBlock(nn.Layer): def __init__(self, in_channels, growth_rate, grmul, n_layers, keepBase=False, residual_out=False, dwconv=False): super().__init__() self.keepBase = keepBase self.links = [] layers_ = [] self.out_channels = 0 for i in range(n_layers): outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, grmul) self.links.append(link) if dwconv: layers_.append(CombConvLayer(inch, outch)) else: layers_.append(ConvLayer(inch, outch)) if (i % 2 == 0) or (i == n_layers - 1): self.out_channels += outch self.layers = nn.LayerList(layers_) def get_out_ch(self): return self.out_channels def get_link(self, layer, base_ch, growth_rate, grmul): if layer == 0: return base_ch, 0, [] out_channels = growth_rate link = [] for i in range(10): dv = 2**i if layer % dv == 0: k = layer - dv link.append(k) if i > 0: out_channels *= grmul out_channels = int(int(out_channels + 1) / 2) * 2 in_channels = 0 for i in link: ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) in_channels += ch return out_channels, in_channels, link def forward(self, x): layers_ = [x] for layer in range(len(self.layers)): link = self.links[layer] tin = [] for i in link: tin.append(layers_[i]) if len(tin) > 1: x = paddle.concat(tin, 1) else: x = tin[0] out = self.layers[layer](x) layers_.append(out) t = len(layers_) out_ = [] for i in range(t): if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): out_.append(layers_[i]) out = paddle.concat(out_, 1) return out @register class HarDNet(nn.Layer): def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85): super(HarDNet, self).__init__() assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch) if arch == 85: first_ch = [48, 96] second_kernel = 3 ch_list = [192, 256, 320, 480, 720] grmul = 1.7 gr = [24, 24, 28, 36, 48] n_layers = [8, 16, 16, 16, 16] elif arch == 68: first_ch = [32, 64] second_kernel = 3 ch_list = [128, 256, 320, 640] grmul = 1.7 gr = [14, 16, 20, 40] n_layers = [8, 16, 16, 16] else: raise ValueError("HarDNet-{} is not supported.".format(arch)) self.return_idx = return_idx self._out_channels = [96, 214, 458, 784] avg_pool = True if depth_wise: second_kernel = 1 avg_pool = False blks = len(n_layers) self.base = nn.LayerList([]) # First Layer: Standard Conv3x3, Stride=2 self.base.append( ConvLayer( in_channels=3, out_channels=first_ch[0], kernel_size=3, stride=2, bias_attr=False)) # Second Layer self.base.append( ConvLayer( first_ch[0], first_ch[1], kernel_size=second_kernel)) # Avgpooling or DWConv3x3 downsampling 
if avg_pool: self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1)) else: self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) # Build all HarDNet blocks ch = first_ch[1] for i in range(blks): blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) ch = blk.out_channels self.base.append(blk) if i != blks - 1: self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) ch = ch_list[i] if i == 0: self.base.append( nn.AvgPool2D( kernel_size=2, stride=2, ceil_mode=True)) elif i != blks - 1 and i != 1 and i != 3: self.base.append(nn.AvgPool2D(kernel_size=2, stride=2)) def forward(self, inputs): x = inputs['image'] outs = [] for i, layer in enumerate(self.base): x = layer(x) if i in self.return_idx: outs.append(x) return outs @property def out_shape(self): return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)] ================================================ FILE: ppdet/modeling/backbones/hgnet_v2.py ================================================ # copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import KaimingNormal, Constant from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D from paddle.regularizer import L2Decay from paddle import ParamAttr import copy from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['PPHGNetV2'] kaiming_normal_ = KaimingNormal() zeros_ = Constant(value=0.) ones_ = Constant(value=1.) 
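# LearnableAffineBlock (below) learns a single scalar scale and a single
# scalar bias, applied as y = scale * x + bias after the activation in
# ConvBNAct when use_lab=True. A minimal sketch of the op it performs
# (illustrative plain paddle, not the class itself):
#
#     x = paddle.rand([1, 8, 4, 4])
#     y = paddle.to_tensor(1.0) * x + paddle.to_tensor(0.0)  # identity at init
#
# Both parameters train at lr_mult * lab_lr, i.e. 1% of the layer's learning
# rate by default, so the learned correction stays close to identity.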
class LearnableAffineBlock(nn.Layer): def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01): super().__init__() self.scale = self.create_parameter( shape=[1, ], default_initializer=Constant(value=scale_value), attr=ParamAttr(learning_rate=lr_mult * lab_lr)) self.add_parameter("scale", self.scale) self.bias = self.create_parameter( shape=[1, ], default_initializer=Constant(value=bias_value), attr=ParamAttr(learning_rate=lr_mult * lab_lr)) self.add_parameter("bias", self.bias) def forward(self, x): return self.scale * x + self.bias class ConvBNAct(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, groups=1, use_act=True, use_lab=False, lr_mult=1.0): super().__init__() self.use_act = use_act self.use_lab = use_lab self.conv = Conv2D( in_channels, out_channels, kernel_size, stride, padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2, groups=groups, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=False) self.bn = BatchNorm2D( out_channels, weight_attr=ParamAttr( regularizer=L2Decay(0.0), learning_rate=lr_mult), bias_attr=ParamAttr( regularizer=L2Decay(0.0), learning_rate=lr_mult)) if self.use_act: self.act = ReLU() if self.use_lab: self.lab = LearnableAffineBlock(lr_mult=lr_mult) def forward(self, x): x = self.conv(x) x = self.bn(x) if self.use_act: x = self.act(x) if self.use_lab: x = self.lab(x) return x class LightConvBNAct(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1, use_lab=False, lr_mult=1.0): super().__init__() self.conv1 = ConvBNAct( in_channels=in_channels, out_channels=out_channels, kernel_size=1, use_act=False, use_lab=use_lab, lr_mult=lr_mult) self.conv2 = ConvBNAct( in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, groups=out_channels, use_act=True, use_lab=use_lab, lr_mult=lr_mult) def forward(self, x): x = self.conv1(x) x = self.conv2(x) return x class StemBlock(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, use_lab=False, lr_mult=1.0): super().__init__() self.stem1 = ConvBNAct( in_channels=in_channels, out_channels=mid_channels, kernel_size=3, stride=2, use_lab=use_lab, lr_mult=lr_mult) self.stem2a = ConvBNAct( in_channels=mid_channels, out_channels=mid_channels // 2, kernel_size=2, stride=1, padding="SAME", use_lab=use_lab, lr_mult=lr_mult) self.stem2b = ConvBNAct( in_channels=mid_channels // 2, out_channels=mid_channels, kernel_size=2, stride=1, padding="SAME", use_lab=use_lab, lr_mult=lr_mult) self.stem3 = ConvBNAct( in_channels=mid_channels * 2, out_channels=mid_channels, kernel_size=3, stride=2, use_lab=use_lab, lr_mult=lr_mult) self.stem4 = ConvBNAct( in_channels=mid_channels, out_channels=out_channels, kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult) self.pool = nn.MaxPool2D( kernel_size=2, stride=1, ceil_mode=True, padding="SAME") def forward(self, x): x = self.stem1(x) x2 = self.stem2a(x) x2 = self.stem2b(x2) x1 = self.pool(x) x = paddle.concat([x1, x2], 1) x = self.stem3(x) x = self.stem4(x) return x class HG_Block(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, kernel_size=3, layer_num=6, identity=False, light_block=True, use_lab=False, lr_mult=1.0): super().__init__() self.identity = identity self.layers = nn.LayerList() block_type = "LightConvBNAct" if light_block else "ConvBNAct" for i in range(layer_num): self.layers.append( eval(block_type)(in_channels=in_channels if i == 0 else mid_channels, out_channels=mid_channels, stride=1, 
kernel_size=kernel_size, use_lab=use_lab, lr_mult=lr_mult)) # feature aggregation total_channels = in_channels + layer_num * mid_channels self.aggregation_squeeze_conv = ConvBNAct( in_channels=total_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult) self.aggregation_excitation_conv = ConvBNAct( in_channels=out_channels // 2, out_channels=out_channels, kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult) def forward(self, x): identity = x output = [] output.append(x) for layer in self.layers: x = layer(x) output.append(x) x = paddle.concat(output, axis=1) x = self.aggregation_squeeze_conv(x) x = self.aggregation_excitation_conv(x) if self.identity: x += identity return x class HG_Stage(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, block_num, layer_num=6, downsample=True, light_block=True, kernel_size=3, use_lab=False, lr_mult=1.0): super().__init__() self.downsample = downsample if downsample: self.downsample = ConvBNAct( in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=2, groups=in_channels, use_act=False, use_lab=use_lab, lr_mult=lr_mult) blocks_list = [] for i in range(block_num): blocks_list.append( HG_Block( in_channels=in_channels if i == 0 else out_channels, mid_channels=mid_channels, out_channels=out_channels, kernel_size=kernel_size, layer_num=layer_num, identity=False if i == 0 else True, light_block=light_block, use_lab=use_lab, lr_mult=lr_mult)) self.blocks = nn.Sequential(*blocks_list) def forward(self, x): if self.downsample: x = self.downsample(x) x = self.blocks(x) return x def _freeze_norm(m: nn.BatchNorm2D): param_attr = ParamAttr( learning_rate=0., regularizer=L2Decay(0.), trainable=False) bias_attr = ParamAttr( learning_rate=0., regularizer=L2Decay(0.), trainable=False) global_stats = True norm = nn.BatchNorm2D( m._num_features, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) for param in norm.parameters(): param.stop_gradient = True return norm def reset_bn(model: nn.Layer, reset_func=_freeze_norm): if isinstance(model, nn.BatchNorm2D): model = reset_func(model) else: for name, child in model.named_children(): _child = reset_bn(child, reset_func) if _child is not child: setattr(model, name, _child) return model @register @serializable class PPHGNetV2(nn.Layer): """ PPHGNetV2 Args: stem_channels: list. Number of channels for the stem block. stage_type: str. The stage configuration of PPHGNet. such as the number of channels, stride, etc. use_lab: boolean. Whether to use LearnableAffineBlock in network. lr_mult_list: list. Control the learning rate of different stages. Returns: model: nn.Layer. Specific PPHGNetV2 model depends on args. 
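    Examples:
        A minimal usage sketch (illustrative; assumes the COCO-style
        input dict that ppdet backbones consume):

            import paddle
            from ppdet.modeling.backbones.hgnet_v2 import PPHGNetV2

            model = PPHGNetV2(arch='L', return_idx=[1, 2, 3])
            feats = model({'image': paddle.rand([1, 3, 640, 640])})
            # three feature maps at strides 8/16/32:
            # [1, 512, 80, 80], [1, 1024, 40, 40], [1, 2048, 20, 20]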
""" arch_configs = { 'S': { 'stem_channels': [3, 24, 32], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [32, 32, 64, 1, False, False, 3, 3], "stage2": [64, 48, 256, 1, True, False, 3, 3], "stage3": [256, 96, 512, 2, True, True, 5, 3], "stage4": [512, 192, 1024, 1, True, True, 5, 3], } }, 'M': { 'stem_channels': [3, 24, 32], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [32, 32, 96, 1, False, False, 3, 4], "stage2": [96, 64, 384, 1, True, False, 3, 4], "stage3": [384, 128, 768, 3, True, True, 5, 4], "stage4": [768, 256, 1536, 1, True, True, 5, 4], } }, 'L': { 'stem_channels': [3, 32, 48], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [48, 48, 128, 1, False, False, 3, 6], "stage2": [128, 96, 512, 1, True, False, 3, 6], "stage3": [512, 192, 1024, 3, True, True, 5, 6], "stage4": [1024, 384, 2048, 1, True, True, 5, 6], } }, 'X': { 'stem_channels': [3, 32, 64], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [64, 64, 128, 1, False, False, 3, 6], "stage2": [128, 128, 512, 2, True, False, 3, 6], "stage3": [512, 256, 1024, 5, True, True, 5, 6], "stage4": [1024, 512, 2048, 2, True, True, 5, 6], } }, 'H': { 'stem_channels': [3, 48, 96], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [96, 96, 192, 2, False, False, 3, 6], "stage2": [192, 192, 512, 3, True, False, 3, 6], "stage3": [512, 384, 1024, 6, True, True, 5, 6], "stage4": [1024, 768, 2048, 3, True, True, 5, 6], } } } def __init__(self, arch, use_lab=False, lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], return_idx=[1, 2, 3], freeze_stem_only=True, freeze_at=0, freeze_norm=True): super().__init__() self.use_lab = use_lab self.return_idx = return_idx stem_channels = self.arch_configs[arch]['stem_channels'] stage_config = self.arch_configs[arch]['stage_config'] self._out_strides = [4, 8, 16, 32] self._out_channels = [stage_config[k][2] for k in stage_config] # stem self.stem = StemBlock( in_channels=stem_channels[0], mid_channels=stem_channels[1], out_channels=stem_channels[2], use_lab=use_lab, lr_mult=lr_mult_list[0]) # stages self.stages = nn.LayerList() for i, k in enumerate(stage_config): in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ k] self.stages.append( HG_Stage( in_channels, mid_channels, out_channels, block_num, layer_num, downsample, light_block, kernel_size, use_lab, lr_mult=lr_mult_list[i + 1])) if freeze_at >= 0: self._freeze_parameters(self.stem) if not freeze_stem_only: for i in range(min(freeze_at + 1, len(self.stages))): self._freeze_parameters(self.stages[i]) if freeze_norm: reset_bn(self, reset_func=_freeze_norm) self._init_weights() def _freeze_parameters(self, m): for p in m.parameters(): p.stop_gradient = True def _init_weights(self): for m in self.sublayers(): if isinstance(m, nn.Conv2D): kaiming_normal_(m.weight) elif isinstance(m, (nn.BatchNorm2D)): ones_(m.weight) zeros_(m.bias) elif isinstance(m, nn.Linear): zeros_(m.bias) @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, inputs): x = inputs['image'] x = self.stem(x) outs = [] for idx, stage 
in enumerate(self.stages): x = stage(x) if idx in self.return_idx: outs.append(x) return outs ================================================ FILE: ppdet/modeling/backbones/hrnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import AdaptiveAvgPool2D, Linear from paddle.regularizer import L2Decay from paddle import ParamAttr from paddle.nn.initializer import Normal, Uniform from numbers import Integral import math from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['HRNet'] class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride=1, norm_type='bn', norm_groups=32, use_dcn=False, norm_momentum=0.9, norm_decay=0., freeze_norm=False, act=None, name=None): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn'] self.act = act self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=False) norm_lr = 0. if freeze_norm else 1. 
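# freeze_norm zeroes the learning rate on the norm layer's scale and bias;
# below it also pins BatchNorm to its running statistics
# (use_global_stats=True) and sets stop_gradient on the norm parameters, so
# the pretrained normalization statistics and affine terms stay fixed.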
param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) global_stats = True if freeze_norm else None if norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D( ch_out, momentum=norm_momentum, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) elif norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=norm_groups, num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) norm_params = self.norm.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True def forward(self, inputs): out = self.conv(inputs) out = self.norm(out) if self.act == 'relu': out = F.relu(out) return out class Layer1(nn.Layer): def __init__(self, num_channels, has_se=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(Layer1, self).__init__() self.bottleneck_block_list = [] for i in range(4): bottleneck_block = self.add_sublayer( "block_{}_{}".format(name, i + 1), BottleneckBlock( num_channels=num_channels if i == 0 else 256, num_filters=64, has_se=has_se, stride=1, downsample=True if i == 0 else False, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_' + str(i + 1))) self.bottleneck_block_list.append(bottleneck_block) def forward(self, input): conv = input for block_func in self.bottleneck_block_list: conv = block_func(conv) return conv class TransitionLayer(nn.Layer): def __init__(self, in_channels, out_channels, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(TransitionLayer, self).__init__() num_in = len(in_channels) num_out = len(out_channels) out = [] self.conv_bn_func_list = [] for i in range(num_out): residual = None if i < num_in: if in_channels[i] != out_channels[i]: residual = self.add_sublayer( "transition_{}_layer_{}".format(name, i + 1), ConvNormLayer( ch_in=in_channels[i], ch_out=out_channels[i], filter_size=3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name=name + '_layer_' + str(i + 1))) else: residual = self.add_sublayer( "transition_{}_layer_{}".format(name, i + 1), ConvNormLayer( ch_in=in_channels[-1], ch_out=out_channels[i], filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name=name + '_layer_' + str(i + 1))) self.conv_bn_func_list.append(residual) def forward(self, input): outs = [] for idx, conv_bn_func in enumerate(self.conv_bn_func_list): if conv_bn_func is None: outs.append(input[idx]) else: if idx < len(input): outs.append(conv_bn_func(input[idx])) else: outs.append(conv_bn_func(input[-1])) return outs class Branches(nn.Layer): def __init__(self, block_num, in_channels, out_channels, has_se=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(Branches, self).__init__() self.basic_block_list = [] for i in range(len(out_channels)): self.basic_block_list.append([]) for j in range(block_num): in_ch = in_channels[i] if j == 0 else out_channels[i] basic_block_func = self.add_sublayer( "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), BasicBlock( num_channels=in_ch, num_filters=out_channels[i], has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_branch_layer_' + str(i + 1) + '_' + str(j + 1))) self.basic_block_list[i].append(basic_block_func) def forward(self, inputs): outs = [] for idx, input in enumerate(inputs): conv = input basic_block_list = 
self.basic_block_list[idx] for basic_block_func in basic_block_list: conv = basic_block_func(conv) outs.append(conv) return outs class BottleneckBlock(nn.Layer): def __init__(self, num_channels, num_filters, has_se, stride=1, downsample=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(BottleneckBlock, self).__init__() self.has_se = has_se self.downsample = downsample self.conv1 = ConvNormLayer( ch_in=num_channels, ch_out=num_filters, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act="relu", name=name + "_conv1") self.conv2 = ConvNormLayer( ch_in=num_filters, ch_out=num_filters, filter_size=3, stride=stride, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act="relu", name=name + "_conv2") self.conv3 = ConvNormLayer( ch_in=num_filters, ch_out=num_filters * 4, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + "_conv3") if self.downsample: self.conv_down = ConvNormLayer( ch_in=num_channels, ch_out=num_filters * 4, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + "_downsample") if self.has_se: self.se = SELayer( num_channels=num_filters * 4, num_filters=num_filters * 4, reduction_ratio=16, name='fc' + name) def forward(self, input): residual = input conv1 = self.conv1(input) conv2 = self.conv2(conv1) conv3 = self.conv3(conv2) if self.downsample: residual = self.conv_down(input) if self.has_se: conv3 = self.se(conv3) y = paddle.add(x=residual, y=conv3) y = F.relu(y) return y class BasicBlock(nn.Layer): def __init__(self, num_channels, num_filters, stride=1, has_se=False, downsample=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(BasicBlock, self).__init__() self.has_se = has_se self.downsample = downsample self.conv1 = ConvNormLayer( ch_in=num_channels, ch_out=num_filters, filter_size=3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, stride=stride, act="relu", name=name + "_conv1") self.conv2 = ConvNormLayer( ch_in=num_filters, ch_out=num_filters, filter_size=3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, stride=1, act=None, name=name + "_conv2") if self.downsample: self.conv_down = ConvNormLayer( ch_in=num_channels, ch_out=num_filters * 4, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + "_downsample") if self.has_se: self.se = SELayer( num_channels=num_filters, num_filters=num_filters, reduction_ratio=16, name='fc' + name) def forward(self, input): residual = input conv1 = self.conv1(input) conv2 = self.conv2(conv1) if self.downsample: residual = self.conv_down(input) if self.has_se: conv2 = self.se(conv2) y = paddle.add(x=residual, y=conv2) y = F.relu(y) return y class SELayer(nn.Layer): def __init__(self, num_channels, num_filters, reduction_ratio, name=None): super(SELayer, self).__init__() self.pool2d_gap = AdaptiveAvgPool2D(1) self._num_channels = num_channels med_ch = int(num_channels / reduction_ratio) stdv = 1.0 / math.sqrt(num_channels * 1.0) self.squeeze = Linear( num_channels, med_ch, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) stdv = 1.0 / math.sqrt(med_ch * 1.0) self.excitation = Linear( med_ch, num_filters, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) def forward(self, input): pool = self.pool2d_gap(input) pool = paddle.squeeze(pool, axis=[2, 3]) squeeze = 
self.squeeze(pool) squeeze = F.relu(squeeze) excitation = self.excitation(squeeze) excitation = F.sigmoid(excitation) excitation = paddle.unsqueeze(excitation, axis=[2, 3]) out = input * excitation return out class Stage(nn.Layer): def __init__(self, num_channels, num_modules, num_filters, has_se=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, multi_scale_output=True, name=None): super(Stage, self).__init__() self._num_modules = num_modules self.stage_func_list = [] for i in range(num_modules): if i == num_modules - 1 and not multi_scale_output: stage_func = self.add_sublayer( "stage_{}_{}".format(name, i + 1), HighResolutionModule( num_channels=num_channels, num_filters=num_filters, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, multi_scale_output=False, name=name + '_' + str(i + 1))) else: stage_func = self.add_sublayer( "stage_{}_{}".format(name, i + 1), HighResolutionModule( num_channels=num_channels, num_filters=num_filters, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_' + str(i + 1))) self.stage_func_list.append(stage_func) def forward(self, input): out = input for idx in range(self._num_modules): out = self.stage_func_list[idx](out) return out class HighResolutionModule(nn.Layer): def __init__(self, num_channels, num_filters, has_se=False, multi_scale_output=True, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(HighResolutionModule, self).__init__() self.branches_func = Branches( block_num=4, in_channels=num_channels, out_channels=num_filters, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name) self.fuse_func = FuseLayers( in_channels=num_filters, out_channels=num_filters, multi_scale_output=multi_scale_output, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name) def forward(self, input): out = self.branches_func(input) out = self.fuse_func(out) return out class FuseLayers(nn.Layer): def __init__(self, in_channels, out_channels, multi_scale_output=True, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(FuseLayers, self).__init__() self._actual_ch = len(in_channels) if multi_scale_output else 1 self._in_channels = in_channels self.residual_func_list = [] for i in range(self._actual_ch): for j in range(len(in_channels)): residual_func = None if j > i: residual_func = self.add_sublayer( "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), ConvNormLayer( ch_in=in_channels[j], ch_out=out_channels[i], filter_size=1, stride=1, act=None, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_layer_' + str(i + 1) + '_' + str(j + 1))) self.residual_func_list.append(residual_func) elif j < i: pre_num_filters = in_channels[j] for k in range(i - j): if k == i - j - 1: residual_func = self.add_sublayer( "residual_{}_layer_{}_{}_{}".format( name, i + 1, j + 1, k + 1), ConvNormLayer( ch_in=pre_num_filters, ch_out=out_channels[i], filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + '_layer_' + str(i + 1) + '_' + str(j + 1) + '_' + str(k + 1))) pre_num_filters = out_channels[i] else: residual_func = self.add_sublayer( "residual_{}_layer_{}_{}_{}".format( name, i + 1, j + 1, k + 1), ConvNormLayer( ch_in=pre_num_filters, ch_out=out_channels[j], filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, 
act="relu", name=name + '_layer_' + str(i + 1) + '_' + str(j + 1) + '_' + str(k + 1))) pre_num_filters = out_channels[j] self.residual_func_list.append(residual_func) def forward(self, input): outs = [] residual_func_idx = 0 for i in range(self._actual_ch): residual = input[i] for j in range(len(self._in_channels)): if j > i: y = self.residual_func_list[residual_func_idx](input[j]) residual_func_idx += 1 y = F.interpolate(y, scale_factor=2**(j - i)) residual = paddle.add(x=residual, y=y) elif j < i: y = input[j] for k in range(i - j): y = self.residual_func_list[residual_func_idx](y) residual_func_idx += 1 residual = paddle.add(x=residual, y=y) residual = F.relu(residual) outs.append(residual) return outs @register class HRNet(nn.Layer): """ HRNet, see https://arxiv.org/abs/1908.07919 Args: width (int): the width of HRNet has_se (bool): whether to add SE block for each stage freeze_at (int): the stage to freeze freeze_norm (bool): whether to freeze norm in HRNet norm_momentum (float): momentum of BatchNorm norm_decay (float): weight decay for normalization layer weights return_idx (List): the stage to return upsample (bool): whether to upsample and concat the backbone feats """ def __init__(self, width=18, has_se=False, freeze_at=0, freeze_norm=True, norm_momentum=0.9, norm_decay=0., return_idx=[0, 1, 2, 3], upsample=False, downsample=False): super(HRNet, self).__init__() self.width = width self.has_se = has_se if isinstance(return_idx, Integral): return_idx = [return_idx] assert len(return_idx) > 0, "need one or more return index" self.freeze_at = freeze_at self.return_idx = return_idx self.upsample = upsample self.downsample = downsample self.channels = { 18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]], 30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]], 32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]], 40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]], 44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]], 48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]], 60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]], 64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]] } channels_2, channels_3, channels_4 = self.channels[width] num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3 self._out_channels = [sum(channels_4)] if self.upsample else channels_4 self._out_strides = [4] if self.upsample else [4, 8, 16, 32] self.conv_layer1_1 = ConvNormLayer( ch_in=3, ch_out=64, filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name="layer1_1") self.conv_layer1_2 = ConvNormLayer( ch_in=64, ch_out=64, filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name="layer1_2") self.la1 = Layer1( num_channels=64, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="layer2") self.tr1 = TransitionLayer( in_channels=[256], out_channels=channels_2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="tr1") self.st2 = Stage( num_channels=channels_2, num_modules=num_modules_2, num_filters=channels_2, has_se=self.has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="st2") self.tr2 = TransitionLayer( in_channels=channels_2, out_channels=channels_3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="tr2") self.st3 = Stage( num_channels=channels_3, num_modules=num_modules_3, num_filters=channels_3, has_se=self.has_se, norm_momentum=norm_momentum, 
norm_decay=norm_decay, freeze_norm=freeze_norm, name="st3") self.tr3 = TransitionLayer( in_channels=channels_3, out_channels=channels_4, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="tr3") self.st4 = Stage( num_channels=channels_4, num_modules=num_modules_4, num_filters=channels_4, has_se=self.has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, multi_scale_output=len(return_idx) > 1, name="st4") if self.downsample: self.incre_modules, self.downsamp_modules, \ self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se) def _make_layer(self, block, inplanes, planes, blocks, stride=1, norm_momentum=0.9, has_se=False, name=None): downsample = None if stride != 1 or inplanes != planes * 4: downsample = True layers = [] layers.append( block( inplanes, planes, has_se, stride, downsample, norm_momentum=norm_momentum, freeze_norm=False, name=name + "_s0")) inplanes = planes * 4 for i in range(1, blocks): layers.append( block( inplanes, planes, has_se, norm_momentum=norm_momentum, freeze_norm=False, name=name + "_s" + str(i))) return nn.Sequential(*layers) def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False): head_block = BottleneckBlock head_channels = [32, 64, 128, 256] # Increasing the #channels on each resolution # from C, 2C, 4C, 8C to 128, 256, 512, 1024 incre_modules = [] for i, channels in enumerate(pre_stage_channels): incre_module = self._make_layer( head_block, channels, head_channels[i], 1, stride=1, norm_momentum=norm_momentum, has_se=has_se, name='incre' + str(i)) incre_modules.append(incre_module) incre_modules = nn.LayerList(incre_modules) # downsampling modules downsamp_modules = [] for i in range(len(pre_stage_channels) - 1): in_channels = head_channels[i] * 4 out_channels = head_channels[i + 1] * 4 downsamp_module = nn.Sequential( nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1), nn.BatchNorm2D( out_channels, momentum=norm_momentum), nn.ReLU()) downsamp_modules.append(downsamp_module) downsamp_modules = nn.LayerList(downsamp_modules) final_layer = nn.Sequential( nn.Conv2D( in_channels=head_channels[3] * 4, out_channels=2048, kernel_size=1, stride=1, padding=0), nn.BatchNorm2D( 2048, momentum=norm_momentum), nn.ReLU()) return incre_modules, downsamp_modules, final_layer def forward(self, inputs): x = inputs['image'] conv1 = self.conv_layer1_1(x) conv2 = self.conv_layer1_2(conv1) la1 = self.la1(conv2) tr1 = self.tr1([la1]) st2 = self.st2(tr1) tr2 = self.tr2(st2) st3 = self.st3(tr2) tr3 = self.tr3(st3) st4 = self.st4(tr3) if self.upsample: # Upsampling x0_h, x0_w = st4[0].shape[2:4] x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear') x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear') x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear') x = paddle.concat([st4[0], x1, x2, x3], 1) return x if self.downsample: y = self.incre_modules[0](st4[0]) for i in range(len(self.downsamp_modules)): y = self.incre_modules[i+1](st4[i+1]) + \ self.downsamp_modules[i](y) y = self.final_layer(y) return y res = [] for i, layer in enumerate(st4): if i == self.freeze_at: layer.stop_gradient = True if i in self.return_idx: res.append(layer) return res @property def out_shape(self): if self.upsample: self.return_idx = [0] return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] ================================================ FILE: 
ppdet/modeling/backbones/lcnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.nn import Conv2D from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec __all__ = ['LCNet'] NET_CONFIG = { "blocks2": #k, in_c, out_c, s, use_se [[3, 16, 32, 1, False], ], "blocks3": [ [3, 32, 64, 2, False], [3, 64, 64, 1, False], ], "blocks4": [ [3, 64, 128, 2, False], [3, 128, 128, 1, False], ], "blocks5": [ [3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], ], "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] } def make_divisible(v, divisor=8, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNLayer(nn.Layer): def __init__(self, num_channels, filter_size, num_filters, stride, num_groups=1, act='hard_swish'): super().__init__() self.conv = Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=num_groups, weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False) self.bn = nn.BatchNorm2D( num_filters, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) if act == 'hard_swish': self.act = nn.Hardswish() elif act == 'relu6': self.act = nn.ReLU6() def forward(self, x): x = self.conv(x) x = self.bn(x) x = self.act(x) return x class DepthwiseSeparable(nn.Layer): def __init__(self, num_channels, num_filters, stride, dw_size=3, use_se=False, act='hard_swish'): super().__init__() self.use_se = use_se self.dw_conv = ConvBNLayer( num_channels=num_channels, num_filters=num_channels, filter_size=dw_size, stride=stride, num_groups=num_channels, act=act) if use_se: self.se = SEModule(num_channels) self.pw_conv = ConvBNLayer( num_channels=num_channels, filter_size=1, num_filters=num_filters, stride=1, act=act) def forward(self, x): x = self.dw_conv(x) if self.use_se: x = self.se(x) x = self.pw_conv(x) return x class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if paddle.device.get_device().startswith("npu"): self.device = "npu" else: self.device = None if isinstance(self._output_size, int) and self._output_size == 1: self._gap = True elif isinstance(self._output_size, tuple) and self._output_size[ 0] == 1 and self._output_size[1] == 1: self._gap = True else: self._gap = False def forward(self, x): if self.device == "npu" and self._gap: # Global Average 
Pooling N, C, _, _ = x.shape x_mean = paddle.mean(x, axis=[2, 3]) x_mean = paddle.reshape(x_mean, [N, C, 1, 1]) return x_mean else: return super(AdaptiveAvgPool2D, self).forward(x) class SEModule(nn.Layer): def __init__(self, channel, reduction=4): super().__init__() self.avg_pool = AdaptiveAvgPool2D(1) self.conv1 = Conv2D( in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0) self.relu = nn.ReLU() self.conv2 = Conv2D( in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0) self.hardsigmoid = nn.Hardsigmoid() def forward(self, x): identity = x x = self.avg_pool(x) x = self.conv1(x) x = self.relu(x) x = self.conv2(x) x = self.hardsigmoid(x) x = paddle.multiply(x=identity, y=x) return x @register @serializable class LCNet(nn.Layer): def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'): super().__init__() self.scale = scale self.feature_maps = feature_maps out_channels = [] self.conv1 = ConvBNLayer( num_channels=3, filter_size=3, num_filters=make_divisible(16 * scale), stride=2, act=act) self.blocks2 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) ]) self.blocks3 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks3"][-1][2] * scale)) self.blocks4 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks4"][-1][2] * scale)) self.blocks5 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks5"][-1][2] * scale)) self.blocks6 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks6"][-1][2] * scale)) self._out_channels = [ ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps ] def forward(self, inputs): x = inputs['image'] outs = [] x = self.conv1(x) x = self.blocks2(x) x = self.blocks3(x) outs.append(x) x = self.blocks4(x) outs.append(x) x = self.blocks5(x) outs.append(x) x = self.blocks6(x) outs.append(x) outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps] return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/lite_hrnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py """ import paddle import paddle.nn as nn import paddle.nn.functional as F from numbers import Integral from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.shape_spec import ShapeSpec from ppdet.modeling.ops import channel_shuffle from .. import layers as L __all__ = ['LiteHRNet'] class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride=1, groups=1, norm_type=None, norm_groups=32, norm_decay=0., freeze_norm=False, act=None): super(ConvNormLayer, self).__init__() self.act = act norm_lr = 0. if freeze_norm else 1. if norm_type is not None: assert norm_type in ['bn', 'sync_bn', 'gn'], \ "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type) param_attr = ParamAttr( initializer=Constant(1.0), learning_rate=norm_lr, regularizer=L2Decay(norm_decay), ) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) global_stats = True if freeze_norm else None if norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats, ) elif norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=norm_groups, num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) norm_params = self.norm.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True conv_bias_attr = False else: conv_bias_attr = True self.norm = None self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.001)), bias_attr=conv_bias_attr) def forward(self, inputs): out = self.conv(inputs) if self.norm is not None: out = self.norm(out) if self.act == 'relu': out = F.relu(out) elif self.act == 'sigmoid': out = F.sigmoid(out) return out class DepthWiseSeparableConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride=1, dw_norm_type=None, pw_norm_type=None, norm_decay=0., freeze_norm=False, dw_act=None, pw_act=None): super(DepthWiseSeparableConvNormLayer, self).__init__() self.depthwise_conv = ConvNormLayer( ch_in=ch_in, ch_out=ch_in, filter_size=filter_size, stride=stride, groups=ch_in, norm_type=dw_norm_type, act=dw_act, norm_decay=norm_decay, freeze_norm=freeze_norm, ) self.pointwise_conv = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, norm_type=pw_norm_type, act=pw_act, norm_decay=norm_decay, freeze_norm=freeze_norm, ) def forward(self, x): x = self.depthwise_conv(x) x = self.pointwise_conv(x) return x class CrossResolutionWeightingModule(nn.Layer): def __init__(self, channels, ratio=16, norm_type='bn', freeze_norm=False, norm_decay=0.): super(CrossResolutionWeightingModule, self).__init__() self.channels = channels total_channel = sum(channels) self.conv1 = ConvNormLayer( ch_in=total_channel, ch_out=total_channel // ratio, filter_size=1, stride=1, 
norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) self.conv2 = ConvNormLayer( ch_in=total_channel // ratio, ch_out=total_channel, filter_size=1, stride=1, norm_type=norm_type, act='sigmoid', freeze_norm=freeze_norm, norm_decay=norm_decay) def forward(self, x): mini_size = x[-1].shape[-2:] out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] out = paddle.concat(out, 1) out = self.conv1(out) out = self.conv2(out) out = paddle.split(out, self.channels, 1) out = [ s * F.interpolate( a, s.shape[-2:], mode='nearest') for s, a in zip(x, out) ] return out class SpatialWeightingModule(nn.Layer): def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.): super(SpatialWeightingModule, self).__init__() self.global_avgpooling = nn.AdaptiveAvgPool2D(1) self.conv1 = ConvNormLayer( ch_in=in_channel, ch_out=in_channel // ratio, filter_size=1, stride=1, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) self.conv2 = ConvNormLayer( ch_in=in_channel // ratio, ch_out=in_channel, filter_size=1, stride=1, act='sigmoid', freeze_norm=freeze_norm, norm_decay=norm_decay) def forward(self, x): out = self.global_avgpooling(x) out = self.conv1(out) out = self.conv2(out) return x * out class ConditionalChannelWeightingBlock(nn.Layer): def __init__(self, in_channels, stride, reduce_ratio, norm_type='bn', freeze_norm=False, norm_decay=0.): super(ConditionalChannelWeightingBlock, self).__init__() assert stride in [1, 2] branch_channels = [channel // 2 for channel in in_channels] self.cross_resolution_weighting = CrossResolutionWeightingModule( branch_channels, ratio=reduce_ratio, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay) self.depthwise_convs = nn.LayerList([ ConvNormLayer( channel, channel, filter_size=3, stride=stride, groups=channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay) for channel in branch_channels ]) self.spatial_weighting = nn.LayerList([ SpatialWeightingModule( channel, ratio=4, freeze_norm=freeze_norm, norm_decay=norm_decay) for channel in branch_channels ]) def forward(self, x): x = [s.chunk(2, axis=1) for s in x] x1 = [s[0] for s in x] x2 = [s[1] for s in x] x2 = self.cross_resolution_weighting(x2) x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)] out = [channel_shuffle(s, groups=2) for s in out] return out class ShuffleUnit(nn.Layer): def __init__(self, in_channel, out_channel, stride, norm_type='bn', freeze_norm=False, norm_decay=0.): super(ShuffleUnit, self).__init__() branch_channel = out_channel // 2 self.stride = stride if self.stride == 1: assert in_channel == branch_channel * 2, \ "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2) if stride > 1: self.branch1 = nn.Sequential( ConvNormLayer( ch_in=in_channel, ch_out=in_channel, filter_size=3, stride=self.stride, groups=in_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=in_channel, ch_out=branch_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ) self.branch2 = nn.Sequential( ConvNormLayer( ch_in=branch_channel if stride == 1 else in_channel, ch_out=branch_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=branch_channel, ch_out=branch_channel, 
filter_size=3, stride=self.stride, groups=branch_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=branch_channel, ch_out=branch_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ) def forward(self, x): if self.stride > 1: x1 = self.branch1(x) x2 = self.branch2(x) else: x1, x2 = x.chunk(2, axis=1) x2 = self.branch2(x2) out = paddle.concat([x1, x2], axis=1) out = channel_shuffle(out, groups=2) return out class IterativeHead(nn.Layer): def __init__(self, in_channels, norm_type='bn', freeze_norm=False, norm_decay=0.): super(IterativeHead, self).__init__() num_branches = len(in_channels) self.in_channels = in_channels[::-1] projects = [] for i in range(num_branches): if i != num_branches - 1: projects.append( DepthWiseSeparableConvNormLayer( ch_in=self.in_channels[i], ch_out=self.in_channels[i + 1], filter_size=3, stride=1, dw_act=None, pw_act='relu', dw_norm_type=norm_type, pw_norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) else: projects.append( DepthWiseSeparableConvNormLayer( ch_in=self.in_channels[i], ch_out=self.in_channels[i], filter_size=3, stride=1, dw_act=None, pw_act='relu', dw_norm_type=norm_type, pw_norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) self.projects = nn.LayerList(projects) def forward(self, x): x = x[::-1] y = [] last_x = None for i, s in enumerate(x): if last_x is not None: last_x = F.interpolate( last_x, size=s.shape[-2:], mode='bilinear', align_corners=True) s = s + last_x s = self.projects[i](s) y.append(s) last_x = s return y[::-1] class Stem(nn.Layer): def __init__(self, in_channel, stem_channel, out_channel, expand_ratio, norm_type='bn', freeze_norm=False, norm_decay=0.): super(Stem, self).__init__() self.conv1 = ConvNormLayer( in_channel, stem_channel, filter_size=3, stride=2, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) mid_channel = int(round(stem_channel * expand_ratio)) branch_channel = stem_channel // 2 if stem_channel == out_channel: inc_channel = out_channel - branch_channel else: inc_channel = out_channel - stem_channel self.branch1 = nn.Sequential( ConvNormLayer( ch_in=branch_channel, ch_out=branch_channel, filter_size=3, stride=2, groups=branch_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=branch_channel, ch_out=inc_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ) self.expand_conv = ConvNormLayer( ch_in=branch_channel, ch_out=mid_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) self.depthwise_conv = ConvNormLayer( ch_in=mid_channel, ch_out=mid_channel, filter_size=3, stride=2, groups=mid_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay) self.linear_conv = ConvNormLayer( ch_in=mid_channel, ch_out=branch_channel if stem_channel == out_channel else stem_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) def forward(self, x): x = self.conv1(x) x1, x2 = x.chunk(2, axis=1) x1 = self.branch1(x1) x2 = self.expand_conv(x2) x2 = self.depthwise_conv(x2) x2 = self.linear_conv(x2) out = paddle.concat([x1, x2], axis=1) out = channel_shuffle(out, groups=2) return out class LiteHRNetModule(nn.Layer): def __init__(self, num_branches, num_blocks, in_channels, reduce_ratio, module_type, 
multiscale_output=False, with_fuse=True, norm_type='bn', freeze_norm=False, norm_decay=0.): super(LiteHRNetModule, self).__init__() assert num_branches == len(in_channels),\ "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels)) assert module_type in [ 'LITE', 'NAIVE' ], "module_type should be one of ['LITE', 'NAIVE']" self.num_branches = num_branches self.in_channels = in_channels self.multiscale_output = multiscale_output self.with_fuse = with_fuse self.norm_type = 'bn' self.module_type = module_type if self.module_type == 'LITE': self.layers = self._make_weighting_blocks( num_blocks, reduce_ratio, freeze_norm=freeze_norm, norm_decay=norm_decay) elif self.module_type == 'NAIVE': self.layers = self._make_naive_branches( num_branches, num_blocks, freeze_norm=freeze_norm, norm_decay=norm_decay) if self.with_fuse: self.fuse_layers = self._make_fuse_layers( freeze_norm=freeze_norm, norm_decay=norm_decay) self.relu = nn.ReLU() def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1, freeze_norm=False, norm_decay=0.): layers = [] for i in range(num_blocks): layers.append( ConditionalChannelWeightingBlock( self.in_channels, stride=stride, reduce_ratio=reduce_ratio, norm_type=self.norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) return nn.Sequential(*layers) def _make_naive_branches(self, num_branches, num_blocks, freeze_norm=False, norm_decay=0.): branches = [] for branch_idx in range(num_branches): layers = [] for i in range(num_blocks): layers.append( ShuffleUnit( self.in_channels[branch_idx], self.in_channels[branch_idx], stride=1, norm_type=self.norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) branches.append(nn.Sequential(*layers)) return nn.LayerList(branches) def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.): if self.num_branches == 1: return None fuse_layers = [] num_out_branches = self.num_branches if self.multiscale_output else 1 for i in range(num_out_branches): fuse_layer = [] for j in range(self.num_branches): if j > i: fuse_layer.append( nn.Sequential( L.Conv2d( self.in_channels[j], self.in_channels[i], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(self.in_channels[i]), nn.Upsample( scale_factor=2**(j - i), mode='nearest'))) elif j == i: fuse_layer.append(None) else: conv_downsamples = [] for k in range(i - j): if k == i - j - 1: conv_downsamples.append( nn.Sequential( L.Conv2d( self.in_channels[j], self.in_channels[j], kernel_size=3, stride=2, padding=1, groups=self.in_channels[j], bias=False, ), nn.BatchNorm2D(self.in_channels[j]), L.Conv2d( self.in_channels[j], self.in_channels[i], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(self.in_channels[i]))) else: conv_downsamples.append( nn.Sequential( L.Conv2d( self.in_channels[j], self.in_channels[j], kernel_size=3, stride=2, padding=1, groups=self.in_channels[j], bias=False, ), nn.BatchNorm2D(self.in_channels[j]), L.Conv2d( self.in_channels[j], self.in_channels[j], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(self.in_channels[j]), nn.ReLU())) fuse_layer.append(nn.Sequential(*conv_downsamples)) fuse_layers.append(nn.LayerList(fuse_layer)) return nn.LayerList(fuse_layers) def forward(self, x): if self.num_branches == 1: return [self.layers[0](x[0])] if self.module_type == 'LITE': out = self.layers(x) elif self.module_type == 'NAIVE': for i in range(self.num_branches): x[i] = self.layers[i](x[i]) out = x if self.with_fuse: out_fuse = [] for i in range(len(self.fuse_layers)): y = out[0] if i == 0 
else self.fuse_layers[i][0](out[0]) for j in range(self.num_branches): if j == 0: y += y elif i == j: y += out[j] else: y += self.fuse_layers[i][j](out[j]) if i == 0: out[i] = y out_fuse.append(self.relu(y)) out = out_fuse elif not self.multiscale_output: out = [out[0]] return out @register class LiteHRNet(nn.Layer): """ @inproceedings{Yulitehrnet21, title={Lite-HRNet: A Lightweight High-Resolution Network}, author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, booktitle={CVPR},year={2021} } Args: network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"], "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet. "wider_naive": Naive network with wider channels in each block. "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting. "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18. freeze_at (int): the stage to freeze freeze_norm (bool): whether to freeze norm in HRNet norm_decay (float): weight decay for normalization layer weights return_idx (List): the stage to return """ def __init__(self, network_type, freeze_at=0, freeze_norm=True, norm_decay=0., return_idx=[0, 1, 2, 3]): super(LiteHRNet, self).__init__() if isinstance(return_idx, Integral): return_idx = [return_idx] assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \ "the network_type should be one of [lite_18, lite_30, naive, wider_naive]" assert len(return_idx) > 0, "need one or more return index" self.freeze_at = freeze_at self.freeze_norm = freeze_norm self.norm_decay = norm_decay self.return_idx = return_idx self.norm_type = 'bn' self.module_configs = { "lite_18": { "num_modules": [2, 4, 2], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["LITE", "LITE", "LITE"], "reduce_ratios": [8, 8, 8], "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], }, "lite_30": { "num_modules": [3, 8, 3], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["LITE", "LITE", "LITE"], "reduce_ratios": [8, 8, 8], "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], }, "naive": { "num_modules": [2, 4, 2], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["NAIVE", "NAIVE", "NAIVE"], "reduce_ratios": [1, 1, 1], "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]], }, "wider_naive": { "num_modules": [2, 4, 2], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["NAIVE", "NAIVE", "NAIVE"], "reduce_ratios": [1, 1, 1], "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], }, } self.stages_config = self.module_configs[network_type] self.stem = Stem(3, 32, 32, 1) num_channels_pre_layer = [32] for stage_idx in range(3): num_channels = self.stages_config["num_channels"][stage_idx] setattr(self, 'transition{}'.format(stage_idx), self._make_transition_layer(num_channels_pre_layer, num_channels, self.freeze_norm, self.norm_decay)) stage, num_channels_pre_layer = self._make_stage( self.stages_config, stage_idx, num_channels, True, self.freeze_norm, self.norm_decay) setattr(self, 'stage{}'.format(stage_idx), stage) self.head_layer = IterativeHead(num_channels_pre_layer, 'bn', self.freeze_norm, self.norm_decay) def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer, freeze_norm=False, norm_decay=0.): num_branches_pre = len(num_channels_pre_layer) num_branches_cur = 
len(num_channels_cur_layer) transition_layers = [] for i in range(num_branches_cur): if i < num_branches_pre: if num_channels_cur_layer[i] != num_channels_pre_layer[i]: transition_layers.append( nn.Sequential( L.Conv2d( num_channels_pre_layer[i], num_channels_pre_layer[i], kernel_size=3, stride=1, padding=1, groups=num_channels_pre_layer[i], bias=False), nn.BatchNorm2D(num_channels_pre_layer[i]), L.Conv2d( num_channels_pre_layer[i], num_channels_cur_layer[i], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(num_channels_cur_layer[i]), nn.ReLU())) else: transition_layers.append(None) else: conv_downsamples = [] for j in range(i + 1 - num_branches_pre): conv_downsamples.append( nn.Sequential( L.Conv2d( num_channels_pre_layer[-1], num_channels_pre_layer[-1], groups=num_channels_pre_layer[-1], kernel_size=3, stride=2, padding=1, bias=False, ), nn.BatchNorm2D(num_channels_pre_layer[-1]), L.Conv2d( num_channels_pre_layer[-1], num_channels_cur_layer[i] if j == i - num_branches_pre else num_channels_pre_layer[-1], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(num_channels_cur_layer[i] if j == i - num_branches_pre else num_channels_pre_layer[-1]), nn.ReLU())) transition_layers.append(nn.Sequential(*conv_downsamples)) return nn.LayerList(transition_layers) def _make_stage(self, stages_config, stage_idx, in_channels, multiscale_output, freeze_norm=False, norm_decay=0.): num_modules = stages_config["num_modules"][stage_idx] num_branches = stages_config["num_branches"][stage_idx] num_blocks = stages_config["num_blocks"][stage_idx] reduce_ratio = stages_config['reduce_ratios'][stage_idx] module_type = stages_config['module_type'][stage_idx] modules = [] for i in range(num_modules): if not multiscale_output and i == num_modules - 1: reset_multiscale_output = False else: reset_multiscale_output = True modules.append( LiteHRNetModule( num_branches, num_blocks, in_channels, reduce_ratio, module_type, multiscale_output=reset_multiscale_output, with_fuse=True, freeze_norm=freeze_norm, norm_decay=norm_decay)) in_channels = modules[-1].in_channels return nn.Sequential(*modules), in_channels def forward(self, inputs): x = inputs['image'] dims = x.shape if len(dims) == 5: x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3], dims[4])) # [6, 3, 128, 96] x = self.stem(x) y_list = [x] for stage_idx in range(3): x_list = [] transition = getattr(self, 'transition{}'.format(stage_idx)) for j in range(self.stages_config["num_branches"][stage_idx]): if transition[j] is not None: if j >= len(y_list): x_list.append(transition[j](y_list[-1])) else: x_list.append(transition[j](y_list[j])) else: x_list.append(y_list[j]) y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list) x = self.head_layer(y_list) res = [] for i, layer in enumerate(x): if i == self.freeze_at: layer.stop_gradient = True if i in self.return_idx: res.append(layer) return res @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] ================================================ FILE: ppdet/modeling/backbones/mobilenet_v1.py ================================================ # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec __all__ = ['MobileNet'] class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_groups=1, act='relu', conv_lr=1., conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ConvBNLayer, self).__init__() self.act = act self._conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=conv_lr, initializer=KaimingNormal(), regularizer=L2Decay(conv_decay)), bias_attr=False) param_attr = ParamAttr(regularizer=L2Decay(norm_decay)) bias_attr = ParamAttr(regularizer=L2Decay(norm_decay)) if norm_type in ['sync_bn', 'bn']: self._batch_norm = nn.BatchNorm2D( out_channels, weight_attr=param_attr, bias_attr=bias_attr) def forward(self, x): x = self._conv(x) x = self._batch_norm(x) if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) return x class DepthwiseSeparable(nn.Layer): def __init__(self, in_channels, out_channels1, out_channels2, num_groups, stride, scale, conv_lr=1., conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(DepthwiseSeparable, self).__init__() self._depthwise_conv = ConvBNLayer( in_channels, int(out_channels1 * scale), kernel_size=3, stride=stride, padding=1, num_groups=int(num_groups * scale), conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_dw") self._pointwise_conv = ConvBNLayer( int(out_channels1 * scale), int(out_channels2 * scale), kernel_size=1, stride=1, padding=0, conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_sep") def forward(self, x): x = self._depthwise_conv(x) x = self._pointwise_conv(x) return x class ExtraBlock(nn.Layer): def __init__(self, in_channels, out_channels1, out_channels2, num_groups=1, stride=2, conv_lr=1., conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ExtraBlock, self).__init__() self.pointwise_conv = ConvBNLayer( in_channels, int(out_channels1), kernel_size=1, stride=1, padding=0, num_groups=int(num_groups), act='relu6', conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_extra1") self.normal_conv = ConvBNLayer( int(out_channels1), int(out_channels2), kernel_size=3, stride=stride, padding=1, num_groups=int(num_groups), act='relu6', conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_extra2") def forward(self, x): x = self.pointwise_conv(x) x = self.normal_conv(x) return x @register @serializable class MobileNet(nn.Layer): __shared__ = ['norm_type'] def __init__(self, norm_type='bn', norm_decay=0., conv_decay=0., scale=1, conv_learning_rate=1.0, feature_maps=[4, 6, 
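# Note: feature_maps indexes the depthwise-separable blocks 1-based (forward
# checks `i + 1 in self.feature_maps`), so the default [4, 6, 13] returns
# conv3_2 (stride 8), conv4_2 (stride 16) and conv6 (stride 32).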
13], with_extra_blocks=False, extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]]): super(MobileNet, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] self.feature_maps = feature_maps self.with_extra_blocks = with_extra_blocks self.extra_block_filters = extra_block_filters self._out_channels = [] self.conv1 = ConvBNLayer( in_channels=3, out_channels=int(32 * scale), kernel_size=3, stride=2, padding=1, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv1") self.dwsl = [] dws21 = self.add_sublayer( "conv2_1", sublayer=DepthwiseSeparable( in_channels=int(32 * scale), out_channels1=32, out_channels2=64, num_groups=32, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv2_1")) self.dwsl.append(dws21) self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps) dws22 = self.add_sublayer( "conv2_2", sublayer=DepthwiseSeparable( in_channels=int(64 * scale), out_channels1=64, out_channels2=128, num_groups=64, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv2_2")) self.dwsl.append(dws22) self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) # 1/4 dws31 = self.add_sublayer( "conv3_1", sublayer=DepthwiseSeparable( in_channels=int(128 * scale), out_channels1=128, out_channels2=128, num_groups=128, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv3_1")) self.dwsl.append(dws31) self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) dws32 = self.add_sublayer( "conv3_2", sublayer=DepthwiseSeparable( in_channels=int(128 * scale), out_channels1=128, out_channels2=256, num_groups=128, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv3_2")) self.dwsl.append(dws32) self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) # 1/8 dws41 = self.add_sublayer( "conv4_1", sublayer=DepthwiseSeparable( in_channels=int(256 * scale), out_channels1=256, out_channels2=256, num_groups=256, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv4_1")) self.dwsl.append(dws41) self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) dws42 = self.add_sublayer( "conv4_2", sublayer=DepthwiseSeparable( in_channels=int(256 * scale), out_channels1=256, out_channels2=512, num_groups=256, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv4_2")) self.dwsl.append(dws42) self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) # 1/16 for i in range(5): tmp = self.add_sublayer( "conv5_" + str(i + 1), sublayer=DepthwiseSeparable( in_channels=int(512 * scale), out_channels1=512, out_channels2=512, num_groups=512, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv5_" + str(i + 1))) self.dwsl.append(tmp) self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) dws56 = self.add_sublayer( "conv5_6", sublayer=DepthwiseSeparable( in_channels=int(512 * scale), out_channels1=512, out_channels2=1024, num_groups=512, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, 
norm_decay=norm_decay, norm_type=norm_type, name="conv5_6")) self.dwsl.append(dws56) self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) # 1/32 dws6 = self.add_sublayer( "conv6", sublayer=DepthwiseSeparable( in_channels=int(1024 * scale), out_channels1=1024, out_channels2=1024, num_groups=1024, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv6")) self.dwsl.append(dws6) self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) if self.with_extra_blocks: self.extra_blocks = [] for i, block_filter in enumerate(self.extra_block_filters): in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1] conv_extra = self.add_sublayer( "conv7_" + str(i + 1), sublayer=ExtraBlock( in_c, block_filter[0], block_filter[1], conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv7_" + str(i + 1))) self.extra_blocks.append(conv_extra) self._update_out_channels( block_filter[1], len(self.dwsl) + len(self.extra_blocks), feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): outs = [] y = self.conv1(inputs['image']) for i, block in enumerate(self.dwsl): y = block(y) if i + 1 in self.feature_maps: outs.append(y) if not self.with_extra_blocks: return outs y = outs[-1] for i, block in enumerate(self.extra_blocks): idx = i + len(self.dwsl) y = block(y) if idx + 1 in self.feature_maps: outs.append(y) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/mobilenet_v3.py ================================================ # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec __all__ = ['MobileNetV3'] def make_divisible(v, divisor=8, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNLayer(nn.Layer): def __init__(self, in_c, out_c, filter_size, stride, padding, num_groups=1, act=None, lr_mult=1., conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=""): super(ConvBNLayer, self).__init__() self.act = act self.conv = nn.Conv2D( in_channels=in_c, out_channels=out_c, kernel_size=filter_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), bias_attr=False) norm_lr = 0. 
if freeze_norm else lr_mult param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) global_stats = True if freeze_norm else None if norm_type in ['sync_bn', 'bn']: self.bn = nn.BatchNorm2D( out_c, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) norm_params = self.bn.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True def forward(self, x): x = self.conv(x) x = self.bn(x) if self.act is not None: if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) elif self.act == "hard_swish": x = F.hardswish(x) else: raise NotImplementedError( "The activation function is selected incorrectly.") return x class ResidualUnit(nn.Layer): def __init__(self, in_c, mid_c, out_c, filter_size, stride, use_se, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, act=None, return_list=False, name=''): super(ResidualUnit, self).__init__() self.if_shortcut = stride == 1 and in_c == out_c self.use_se = use_se self.return_list = return_list self.expand_conv = ConvBNLayer( in_c=in_c, out_c=mid_c, filter_size=1, stride=1, padding=0, act=act, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_expand") self.bottleneck_conv = ConvBNLayer( in_c=mid_c, out_c=mid_c, filter_size=filter_size, stride=stride, padding=int((filter_size - 1) // 2), num_groups=mid_c, act=act, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_depthwise") if self.use_se: self.mid_se = SEModule( mid_c, lr_mult, conv_decay, name=name + "_se") self.linear_conv = ConvBNLayer( in_c=mid_c, out_c=out_c, filter_size=1, stride=1, padding=0, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_linear") def forward(self, inputs): y = self.expand_conv(inputs) x = self.bottleneck_conv(y) if self.use_se: x = self.mid_se(x) x = self.linear_conv(x) if self.if_shortcut: x = paddle.add(inputs, x) if self.return_list: return [y, x] else: return x class SEModule(nn.Layer): def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""): super(SEModule, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2D(1) mid_channels = int(channel // reduction) self.conv1 = nn.Conv2D( in_channels=channel, out_channels=mid_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), bias_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) self.conv2 = nn.Conv2D( in_channels=mid_channels, out_channels=channel, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), bias_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) def forward(self, inputs): outputs = self.avg_pool(inputs) outputs = self.conv1(outputs) outputs = F.relu(outputs) outputs = self.conv2(outputs) outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) return paddle.multiply(x=inputs, y=outputs) class ExtraBlockDW(nn.Layer): def __init__(self, in_c, ch_1, ch_2, stride, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=None): super(ExtraBlockDW, self).__init__() self.pointwise_conv = ConvBNLayer( in_c=in_c, out_c=ch_1, 
filter_size=1, stride=1, padding='SAME', act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra1") self.depthwise_conv = ConvBNLayer( in_c=ch_1, out_c=ch_2, filter_size=3, stride=stride, padding='SAME', num_groups=int(ch_1), act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_dw") self.normal_conv = ConvBNLayer( in_c=ch_2, out_c=ch_2, filter_size=1, stride=1, padding='SAME', act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_sep") def forward(self, inputs): x = self.pointwise_conv(inputs) x = self.depthwise_conv(x) x = self.normal_conv(x) return x @register @serializable class MobileNetV3(nn.Layer): __shared__ = ['norm_type'] def __init__( self, scale=1.0, model_name="large", feature_maps=[6, 12, 15], with_extra_blocks=False, extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], conv_decay=0.0, multiplier=1.0, norm_type='bn', norm_decay=0.0, freeze_norm=False): super(MobileNetV3, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] if norm_type == 'sync_bn' and freeze_norm: raise ValueError( "The norm_type should not be sync_bn when freeze_norm is True") self.feature_maps = feature_maps self.with_extra_blocks = with_extra_blocks self.extra_block_filters = extra_block_filters inplanes = 16 if model_name == "large": self.cfg = [ # k, exp, c, se, nl, s, [3, 16, 16, False, "relu", 1], [3, 64, 24, False, "relu", 2], [3, 72, 24, False, "relu", 1], [5, 72, 40, True, "relu", 2], # RCNN output [5, 120, 40, True, "relu", 1], [5, 120, 40, True, "relu", 1], # YOLOv3 output [3, 240, 80, False, "hard_swish", 2], # RCNN output [3, 200, 80, False, "hard_swish", 1], [3, 184, 80, False, "hard_swish", 1], [3, 184, 80, False, "hard_swish", 1], [3, 480, 112, True, "hard_swish", 1], [3, 672, 112, True, "hard_swish", 1], # YOLOv3 output [5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output [5, 960, 160, True, "hard_swish", 1], [5, 960, 160, True, "hard_swish", 1], # YOLOv3 output ] elif model_name == "small": self.cfg = [ # k, exp, c, se, nl, s, [3, 16, 16, True, "relu", 2], [3, 72, 24, False, "relu", 2], # RCNN output [3, 88, 24, False, "relu", 1], # YOLOv3 output [5, 96, 40, True, "hard_swish", 2], # RCNN output [5, 240, 40, True, "hard_swish", 1], [5, 240, 40, True, "hard_swish", 1], [5, 120, 48, True, "hard_swish", 1], [5, 144, 48, True, "hard_swish", 1], # YOLOv3 output [5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output [5, 576, 96, True, "hard_swish", 1], [5, 576, 96, True, "hard_swish", 1], # YOLOv3 output ] else: raise NotImplementedError( "mode[{}_model] is not implemented!".format(model_name)) if multiplier != 1.0: self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier) self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier) self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier) self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier) self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier) self.conv1 = ConvBNLayer( in_c=3, out_c=make_divisible(inplanes * scale), filter_size=3, stride=2, padding=1, num_groups=1, act="hard_swish", lr_mult=lr_mult_list[0], conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv1") self._out_channels = [] self.block_list = [] i = 0 inplanes = make_divisible(inplanes * 
scale) for (k, exp, c, se, nl, s) in self.cfg: lr_idx = min(i // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] # for SSD/SSDLite, first head input is after ResidualUnit expand_conv return_list = self.with_extra_blocks and i + 2 in self.feature_maps block = self.add_sublayer( "conv" + str(i + 2), sublayer=ResidualUnit( in_c=inplanes, mid_c=make_divisible(scale * exp), out_c=make_divisible(scale * c), filter_size=k, stride=s, use_se=se, act=nl, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, return_list=return_list, name="conv" + str(i + 2))) self.block_list.append(block) inplanes = make_divisible(scale * c) i += 1 self._update_out_channels( make_divisible(scale * exp) if return_list else inplanes, i + 1, feature_maps) if self.with_extra_blocks: self.extra_block_list = [] extra_out_c = make_divisible(scale * self.cfg[-1][1]) lr_idx = min(i // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] conv_extra = self.add_sublayer( "conv" + str(i + 2), sublayer=ConvBNLayer( in_c=inplanes, out_c=extra_out_c, filter_size=1, stride=1, padding=0, num_groups=1, act="hard_swish", lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv" + str(i + 2))) self.extra_block_list.append(conv_extra) i += 1 self._update_out_channels(extra_out_c, i + 1, feature_maps) for j, block_filter in enumerate(self.extra_block_filters): in_c = extra_out_c if j == 0 else self.extra_block_filters[j - 1][1] conv_extra = self.add_sublayer( "conv" + str(i + 2), sublayer=ExtraBlockDW( in_c, block_filter[0], block_filter[1], stride=2, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name='conv' + str(i + 2))) self.extra_block_list.append(conv_extra) i += 1 self._update_out_channels(block_filter[1], i + 1, feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): x = self.conv1(inputs['image']) outs = [] for idx, block in enumerate(self.block_list): x = block(x) if idx + 2 in self.feature_maps: if isinstance(x, list): outs.append(x[0]) x = x[1] else: outs.append(x) if not self.with_extra_blocks: return outs for i, block in enumerate(self.extra_block_list): idx = i + len(self.block_list) x = block(x) if idx + 2 in self.feature_maps: outs.append(x) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/mobileone.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
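At training time each MobileOneBlock runs conv_num depthwise and pointwise branches in parallel, plus a 1x1 branch and, when ch_in == ch_out and stride == 1, identity BatchNorm branches. convert_to_deploy() folds every Conv+BN pair into one biased convolution, W' = W * gamma / sqrt(var + eps) and b' = beta - mu * gamma / sqrt(var + eps) (see _fuse_bn_tensor below), then sums the branches, so inference runs a single depthwise conv followed by a single pointwise conv. A minimal usage sketch, assuming `x` is an NCHW paddle tensor with 32 channels (illustrative only):
    block = MobileOneBlock(ch_in=32, ch_out=64, stride=1, kernel_size=3, conv_num=4)
    block.eval()
    y_train = block(x)
    block.convert_to_deploy()
    y_deploy = block(x)  # matches y_train up to numerical precision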
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py The copyright of DingXiaoH/RepVGG is as follows: MIT License [see LICENSE for details] """ import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Normal, Constant from ppdet.modeling.ops import get_act_fn from ppdet.modeling.layers import ConvNormLayer class MobileOneBlock(nn.Layer): def __init__( self, ch_in, ch_out, stride, kernel_size, conv_num=1, norm_type='bn', norm_decay=0., norm_groups=32, bias_on=False, lr_scale=1., freeze_norm=False, initializer=Normal( mean=0., std=0.01), skip_quant=False, act='relu', ): super(MobileOneBlock, self).__init__() self.ch_in = ch_in self.ch_out = ch_out self.kernel_size = kernel_size self.stride = stride self.padding = (kernel_size - 1) // 2 self.k = conv_num self.depth_conv = nn.LayerList() self.point_conv = nn.LayerList() for _ in range(self.k): self.depth_conv.append( ConvNormLayer( ch_in, ch_in, kernel_size, stride=stride, groups=ch_in, norm_type=norm_type, norm_decay=norm_decay, norm_groups=norm_groups, bias_on=bias_on, lr_scale=lr_scale, freeze_norm=freeze_norm, initializer=initializer, skip_quant=skip_quant)) self.point_conv.append( ConvNormLayer( ch_in, ch_out, 1, stride=1, groups=1, norm_type=norm_type, norm_decay=norm_decay, norm_groups=norm_groups, bias_on=bias_on, lr_scale=lr_scale, freeze_norm=freeze_norm, initializer=initializer, skip_quant=skip_quant)) self.rbr_1x1 = ConvNormLayer( ch_in, ch_in, 1, stride=self.stride, groups=ch_in, norm_type=norm_type, norm_decay=norm_decay, norm_groups=norm_groups, bias_on=bias_on, lr_scale=lr_scale, freeze_norm=freeze_norm, initializer=initializer, skip_quant=skip_quant) self.rbr_identity_st1 = nn.BatchNorm2D( num_features=ch_in, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay( 0.0))) if ch_in == ch_out and self.stride == 1 else None self.rbr_identity_st2 = nn.BatchNorm2D( num_features=ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay( 0.0))) if ch_in == ch_out and self.stride == 1 else None self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): if hasattr(self, "conv1") and hasattr(self, "conv2"): y = self.act(self.conv2(self.act(self.conv1(x)))) else: if self.rbr_identity_st1 is None: id_out_st1 = 0 else: id_out_st1 = self.rbr_identity_st1(x) x1_1 = 0 for i in range(self.k): x1_1 += self.depth_conv[i](x) x1_2 = self.rbr_1x1(x) x1 = self.act(x1_1 + x1_2 + id_out_st1) if self.rbr_identity_st2 is None: id_out_st2 = 0 else: id_out_st2 = self.rbr_identity_st2(x1) x2_1 = 0 for i in range(self.k): x2_1 += self.point_conv[i](x1) y = self.act(x2_1 + id_out_st2) return y def convert_to_deploy(self): if not hasattr(self, 'conv1'): self.conv1 = nn.Conv2D( in_channels=self.ch_in, out_channels=self.ch_in, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, groups=self.ch_in, bias_attr=ParamAttr( initializer=Constant(value=0.), learning_rate=1.)) if not hasattr(self, 'conv2'): self.conv2 = nn.Conv2D( in_channels=self.ch_in, out_channels=self.ch_out, kernel_size=1, stride=1, padding='SAME', groups=1, bias_attr=ParamAttr( initializer=Constant(value=0.), learning_rate=1.)) conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias( ) self.conv1.weight.set_value(conv1_kernel) self.conv1.bias.set_value(conv1_bias) self.conv2.weight.set_value(conv2_kernel)
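# With the fused kernels and biases copied into conv1/conv2, the training-time
# branches (depth_conv, point_conv, rbr_1x1 and the identity BNs) are deleted
# below; forward() then takes the plain conv1 -> act -> conv2 -> act path.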
self.conv2.bias.set_value(conv2_bias) self.__delattr__('depth_conv') self.__delattr__('point_conv') self.__delattr__('rbr_1x1') if hasattr(self, 'rbr_identity_st1'): self.__delattr__('rbr_identity_st1') if hasattr(self, 'rbr_identity_st2'): self.__delattr__('rbr_identity_st2') def get_equivalent_kernel_bias(self): st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv) st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) st1_kernelid, st1_biasid = self._fuse_bn_tensor( self.rbr_identity_st1, kernel_size=self.kernel_size) st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv) st2_kernelid, st2_biasid = self._fuse_bn_tensor( self.rbr_identity_st2, kernel_size=1) conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor( st1_kernel1x1) + st1_kernelid conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid conv2_kernel = st2_kernel1x1 + st2_kernelid conv2_bias = st2_bias1x1 + st2_biasid return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: padding_size = (self.kernel_size - 1) // 2 return nn.functional.pad( kernel1x1, [padding_size, padding_size, padding_size, padding_size]) def _fuse_bn_tensor(self, branch, kernel_size=3): if branch is None: return 0, 0 if isinstance(branch, nn.LayerList): fused_kernels = [] fused_bias = [] for block in branch: kernel = block.conv.weight running_mean = block.norm._mean running_var = block.norm._variance gamma = block.norm.weight beta = block.norm.bias eps = block.norm._epsilon std = (running_var + eps).sqrt() t = (gamma / std).reshape((-1, 1, 1, 1)) fused_kernels.append(kernel * t) fused_bias.append(beta - running_mean * gamma / std) return sum(fused_kernels), sum(fused_bias) elif isinstance(branch, ConvNormLayer): kernel = branch.conv.weight running_mean = branch.norm._mean running_var = branch.norm._variance gamma = branch.norm.weight beta = branch.norm.bias eps = branch.norm._epsilon else: assert isinstance(branch, nn.BatchNorm2D) input_dim = self.ch_in if kernel_size == 1 else 1 kernel_value = paddle.zeros( shape=[self.ch_in, input_dim, kernel_size, kernel_size], dtype='float32') if kernel_size > 1: for i in range(self.ch_in): kernel_value[i, i % input_dim, (kernel_size - 1) // 2, ( kernel_size - 1) // 2] = 1 elif kernel_size == 1: for i in range(self.ch_in): kernel_value[i, i % input_dim, 0, 0] = 1 else: raise ValueError("Invalid kernel size received!") kernel = paddle.to_tensor(kernel_value, place=branch.weight.place) running_mean = branch._mean running_var = branch._variance gamma = branch.weight beta = branch.bias eps = branch._epsilon std = (running_var + eps).sqrt() t = (gamma / std).reshape((-1, 1, 1, 1)) return kernel * t, beta - running_mean * gamma / std ================================================ FILE: ppdet/modeling/backbones/name_adapter.py ================================================ class NameAdapter(object): """Fix the backbone's variable names for pretrained weights""" def __init__(self, model): super(NameAdapter, self).__init__() self.model = model @property def model_type(self): return getattr(self.model, '_model_type', '') @property def variant(self): return getattr(self.model, 'variant', '') def fix_conv_norm_name(self, name): if name == "conv1": bn_name = "bn_" + name else: bn_name = "bn" + name[3:] # the naming rule is the same as the pretrained weight if self.model_type == 'SEResNeXt': bn_name = name + "_bn" return bn_name def fix_shortcut_name(self, name): if self.model_type == 'SEResNeXt': name = 'conv' + name +
'_prj' return name def fix_bottleneck_name(self, name): if self.model_type == 'SEResNeXt': conv_name1 = 'conv' + name + '_x1' conv_name2 = 'conv' + name + '_x2' conv_name3 = 'conv' + name + '_x3' shortcut_name = name else: conv_name1 = name + "_branch2a" conv_name2 = name + "_branch2b" conv_name3 = name + "_branch2c" shortcut_name = name + "_branch1" return conv_name1, conv_name2, conv_name3, shortcut_name def fix_basicblock_name(self, name): if self.model_type == 'SEResNeXt': conv_name1 = 'conv' + name + '_x1' conv_name2 = 'conv' + name + '_x2' shortcut_name = name else: conv_name1 = name + "_branch2a" conv_name2 = name + "_branch2b" shortcut_name = name + "_branch1" return conv_name1, conv_name2, shortcut_name def fix_layer_warp_name(self, stage_num, count, i): name = 'res' + str(stage_num) if count > 10 and stage_num == 4: if i == 0: conv_name = name + "a" else: conv_name = name + "b" + str(i) else: conv_name = name + chr(ord("a") + i) if self.model_type == 'SEResNeXt': conv_name = str(stage_num + 2) + '_' + str(i + 1) return conv_name def fix_c1_stage_name(self): return "res_conv1" if self.model_type == 'ResNeXt' else "conv1" ================================================ FILE: ppdet/modeling/backbones/res2net.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
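# Res2Net swaps the single 3x3 conv of a ResNet bottleneck for a hierarchical
# group of 3x3 convs: the branch2a output is split into `scales` channel
# groups; group 0 is convolved directly, each middle group is added to the
# previous group's output before its own conv, and the last group is passed
# through unchanged (stride 1) or average-pooled (stride 2). A rough sketch of
# the pattern in BottleNeck.forward below, assuming scales=4 and stride=1
# (illustrative only):
#
#     xs = paddle.split(out, 4, axis=1)
#     ys = [branch2b[0](xs[0])]
#     ys.append(branch2b[1](paddle.add(xs[1], ys[-1])))
#     ys.append(branch2b[2](paddle.add(xs[2], ys[-1])))
#     ys.append(xs[3])
#     out = branch2c(paddle.concat(ys, axis=1))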
from numbers import Integral import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from .resnet import ConvNormLayer __all__ = ['Res2Net', 'Res2NetC5'] Res2Net_cfg = { 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], 200: [3, 12, 48, 3] } class BottleNeck(nn.Layer): def __init__(self, ch_in, ch_out, stride, shortcut, width, scales=4, variant='b', groups=1, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False): super(BottleNeck, self).__init__() self.shortcut = shortcut self.scales = scales self.stride = stride if not shortcut: if variant == 'd' and stride == 2: self.branch1 = nn.Sequential() self.branch1.add_sublayer( 'pool', nn.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True)) self.branch1.add_sublayer( 'conv', ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr)) else: self.branch1 = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=stride, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2a = ConvNormLayer( ch_in=ch_in, ch_out=width * scales, filter_size=1, stride=stride if variant == 'a' else 1, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2b = nn.LayerList([ ConvNormLayer( ch_in=width, ch_out=width, filter_size=3, stride=1 if variant == 'a' else stride, groups=groups, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr, dcn_v2=dcn_v2) for _ in range(self.scales - 1) ]) self.branch2c = ConvNormLayer( ch_in=width * scales, ch_out=ch_out, filter_size=1, stride=1, groups=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) def forward(self, inputs): out = self.branch2a(inputs) feature_split = paddle.split(out, self.scales, 1) out_split = [] for i in range(self.scales - 1): if i == 0 or self.stride == 2: out_split.append(self.branch2b[i](feature_split[i])) else: out_split.append(self.branch2b[i](paddle.add(feature_split[i], out_split[-1]))) if self.stride == 1: out_split.append(feature_split[-1]) else: out_split.append(F.avg_pool2d(feature_split[-1], 3, self.stride, 1)) out = self.branch2c(paddle.concat(out_split, 1)) if self.shortcut: short = inputs else: short = self.branch1(inputs) out = paddle.add(out, short) out = F.relu(out) return out class Blocks(nn.Layer): def __init__(self, ch_in, ch_out, count, stage_num, width, scales=4, variant='b', groups=1, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False): super(Blocks, self).__init__() self.blocks = nn.Sequential() for i in range(count): self.blocks.add_sublayer( str(i), BottleNeck( ch_in=ch_in if i == 0 else ch_out, ch_out=ch_out, stride=2 if i == 0 and stage_num != 2 else 1, shortcut=False if i == 0 else True, width=width * (2**(stage_num - 2)), scales=scales, variant=variant, groups=groups, lr=lr, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=dcn_v2)) def forward(self, inputs): return self.blocks(inputs) @register @serializable class Res2Net(nn.Layer): """ Res2Net, see https://arxiv.org/abs/1904.01169 Args: depth (int): Res2Net depth, should be 50, 101, 152, 200. 
width (int): Res2Net width scales (int): Res2Net scale variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages (2,3,4,5); a lower learning rate ratio is needed for pretrained models obtained via distillation (default [1.0, 1.0, 1.0, 1.0]). groups (int): The group number of the Conv Layer. norm_type (str): normalization type, 'bn' or 'sync_bn' norm_decay (float): weight decay for normalization layer weights freeze_norm (bool): freeze normalization layers freeze_at (int): freeze the backbone at which stage return_idx (list): index of stages whose feature maps are returned, index 0 stands for res2 dcn_v2_stages (list): index of stages that use deformable conv v2 num_stages (int): number of stages created """ __shared__ = ['norm_type'] def __init__(self, depth=50, width=26, scales=4, variant='b', lr_mult_list=[1.0, 1.0, 1.0, 1.0], groups=1, norm_type='bn', norm_decay=0., freeze_norm=True, freeze_at=0, return_idx=[0, 1, 2, 3], dcn_v2_stages=[-1], num_stages=4): super(Res2Net, self).__init__() self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt' assert depth in [50, 101, 152, 200], \ "depth {} not in [50, 101, 152, 200]".format(depth) assert variant in ['a', 'b', 'c', 'd'], "invalid Res2Net variant" assert num_stages >= 1 and num_stages <= 4 self.depth = depth self.variant = variant self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.freeze_at = freeze_at if isinstance(return_idx, Integral): return_idx = [return_idx] assert max(return_idx) < num_stages, \ 'the maximum return index must be smaller than num_stages, ' \ 'but received maximum return index is {} and num_stages ' \ 'is {}'.format(max(return_idx), num_stages) self.return_idx = return_idx self.num_stages = num_stages assert len(lr_mult_list) == 4, \ "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) if isinstance(dcn_v2_stages, Integral): dcn_v2_stages = [dcn_v2_stages] assert max(dcn_v2_stages) < num_stages self.dcn_v2_stages = dcn_v2_stages block_nums = Res2Net_cfg[depth] # C1 stage if self.variant in ['c', 'd']: conv_def = [ [3, 32, 3, 2, "conv1_1"], [32, 32, 3, 1, "conv1_2"], [32, 64, 3, 1, "conv1_3"], ] else: conv_def = [[3, 64, 7, 2, "conv1"]] self.res1 = nn.Sequential() for (c_in, c_out, k, s, _name) in conv_def: self.res1.add_sublayer( _name, ConvNormLayer( ch_in=c_in, ch_out=c_out, filter_size=k, stride=s, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=1.0)) self._in_channels = [64, 256, 512, 1024] self._out_channels = [256, 512, 1024, 2048] self._out_strides = [4, 8, 16, 32] # C2-C5 stages self.res_layers = [] for i in range(num_stages): lr_mult = lr_mult_list[i] stage_num = i + 2 self.res_layers.append( self.add_sublayer( "res{}".format(stage_num), Blocks( self._in_channels[i], self._out_channels[i], count=block_nums[i], stage_num=stage_num, width=width, scales=scales, groups=groups, lr=lr_mult, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=(i in self.dcn_v2_stages)))) @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, inputs): x = inputs['image'] res1 = self.res1(x) x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1) outs = [] for idx, stage in enumerate(self.res_layers): x = stage(x) if idx == self.freeze_at: x.stop_gradient = True if idx in self.return_idx: outs.append(x) return outs @register class
Res2NetC5(nn.Layer): def __init__(self, depth=50, width=26, scales=4, variant='b'): super(Res2NetC5, self).__init__() feat_in, feat_out = [1024, 2048] self.res5 = Blocks( feat_in, feat_out, count=3, stage_num=5, width=width, scales=scales, variant=variant) self.feat_out = feat_out @property def out_shape(self): return [ShapeSpec( channels=self.feat_out, stride=32, )] def forward(self, roi_feat, stage=0): y = self.res5(roi_feat) return y ================================================ FILE: ppdet/modeling/backbones/resnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math from numbers import Integral import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from paddle.regularizer import L2Decay from paddle.nn.initializer import Uniform from paddle import ParamAttr from paddle.nn.initializer import Constant from paddle.vision.ops import DeformConv2D from .name_adapter import NameAdapter from ..shape_spec import ShapeSpec __all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck'] ResNet_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], } class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride, groups=1, act=None, norm_type='bn', norm_decay=0., freeze_norm=True, lr=1.0, dcn_v2=False): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn'] self.norm_type = norm_type self.act = act self.dcn_v2 = dcn_v2 if not self.dcn_v2: self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr(learning_rate=lr), bias_attr=False) else: self.offset_channel = 2 * filter_size**2 self.mask_channel = filter_size**2 self.conv_offset = nn.Conv2D( in_channels=ch_in, out_channels=3 * filter_size**2, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, weight_attr=ParamAttr(initializer=Constant(0.)), bias_attr=ParamAttr(initializer=Constant(0.))) self.conv = DeformConv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, dilation=1, groups=groups, weight_attr=ParamAttr(learning_rate=lr), bias_attr=False) norm_lr = 0. 
if freeze_norm else lr param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) global_stats = True if freeze_norm else None if norm_type in ['sync_bn', 'bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) norm_params = self.norm.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True def forward(self, inputs): if not self.dcn_v2: out = self.conv(inputs) else: offset_mask = self.conv_offset(inputs) offset, mask = paddle.split( offset_mask, num_or_sections=[self.offset_channel, self.mask_channel], axis=1) mask = F.sigmoid(mask) out = self.conv(inputs, offset, mask=mask) if self.norm_type in ['bn', 'sync_bn']: out = self.norm(out) if self.act: out = getattr(F, self.act)(out) return out class SELayer(nn.Layer): def __init__(self, ch, reduction_ratio=16): super(SELayer, self).__init__() self.pool = nn.AdaptiveAvgPool2D(1) stdv = 1.0 / math.sqrt(ch) c_ = ch // reduction_ratio self.squeeze = nn.Linear( ch, c_, weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=True) stdv = 1.0 / math.sqrt(c_) self.extract = nn.Linear( c_, ch, weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=True) def forward(self, inputs): out = self.pool(inputs) out = paddle.squeeze(out, axis=[2, 3]) out = self.squeeze(out) out = F.relu(out) out = self.extract(out) out = F.sigmoid(out) out = paddle.unsqueeze(out, axis=[2, 3]) scale = out * inputs return scale class BasicBlock(nn.Layer): expansion = 1 def __init__(self, ch_in, ch_out, stride, shortcut, variant='b', groups=1, base_width=64, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False, std_senet=False): super(BasicBlock, self).__init__() assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64' self.shortcut = shortcut if not shortcut: if variant == 'd' and stride == 2: self.short = nn.Sequential() self.short.add_sublayer( 'pool', nn.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True)) self.short.add_sublayer( 'conv', ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr)) else: self.short = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=stride, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2a = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=3, stride=stride, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2b = ConvNormLayer( ch_in=ch_out, ch_out=ch_out, filter_size=3, stride=1, act=None, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr, dcn_v2=dcn_v2) self.std_senet = std_senet if self.std_senet: self.se = SELayer(ch_out) def forward(self, inputs): out = self.branch2a(inputs) out = self.branch2b(out) if self.std_senet: out = self.se(out) if self.shortcut: short = inputs else: short = self.short(inputs) out = paddle.add(x=out, y=short) out = F.relu(out) return out class BottleNeck(nn.Layer): expansion = 4 def __init__(self, ch_in, ch_out, stride, shortcut, variant='b', groups=1, base_width=4, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False, std_senet=False): super(BottleNeck, self).__init__() if variant == 'a': stride1, stride2 = stride, 
1 else: stride1, stride2 = 1, stride # ResNeXt width = int(ch_out * (base_width / 64.)) * groups self.branch2a = ConvNormLayer( ch_in=ch_in, ch_out=width, filter_size=1, stride=stride1, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2b = ConvNormLayer( ch_in=width, ch_out=width, filter_size=3, stride=stride2, groups=groups, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr, dcn_v2=dcn_v2) self.branch2c = ConvNormLayer( ch_in=width, ch_out=ch_out * self.expansion, filter_size=1, stride=1, groups=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.shortcut = shortcut if not shortcut: if variant == 'd' and stride == 2: self.short = nn.Sequential() self.short.add_sublayer( 'pool', nn.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True)) self.short.add_sublayer( 'conv', ConvNormLayer( ch_in=ch_in, ch_out=ch_out * self.expansion, filter_size=1, stride=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr)) else: self.short = ConvNormLayer( ch_in=ch_in, ch_out=ch_out * self.expansion, filter_size=1, stride=stride, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.std_senet = std_senet if self.std_senet: self.se = SELayer(ch_out * self.expansion) def forward(self, inputs): out = self.branch2a(inputs) out = self.branch2b(out) out = self.branch2c(out) if self.std_senet: out = self.se(out) if self.shortcut: short = inputs else: short = self.short(inputs) out = paddle.add(x=out, y=short) out = F.relu(out) return out class Blocks(nn.Layer): def __init__(self, block, ch_in, ch_out, count, name_adapter, stage_num, variant='b', groups=1, base_width=64, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False, std_senet=False): super(Blocks, self).__init__() self.blocks = [] for i in range(count): conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i) layer = self.add_sublayer( conv_name, block( ch_in=ch_in, ch_out=ch_out, stride=2 if i == 0 and stage_num != 2 else 1, shortcut=False if i == 0 else True, variant=variant, groups=groups, base_width=base_width, lr=lr, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=dcn_v2, std_senet=std_senet)) self.blocks.append(layer) if i == 0: ch_in = ch_out * block.expansion def forward(self, inputs): block_out = inputs for block in self.blocks: block_out = block(block_out) return block_out @register @serializable class ResNet(nn.Layer): __shared__ = ['norm_type'] def __init__(self, depth=50, ch_in=64, variant='b', lr_mult_list=[1.0, 1.0, 1.0, 1.0], groups=1, base_width=64, norm_type='bn', norm_decay=0, freeze_norm=True, freeze_at=0, return_idx=[0, 1, 2, 3], dcn_v2_stages=[-1], num_stages=4, std_senet=False, freeze_stem_only=False): """ Residual Network, see https://arxiv.org/abs/1512.03385 Args: depth (int): ResNet depth, should be 18, 34, 50, 101, 152. ch_in (int): output channel of first stage, default 64 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), lower learning rate ratio is need for pretrained model got using distillation(default as [1.0, 1.0, 1.0, 1.0]). 
groups (int): group convolution cardinality base_width (int): base width of each group convolution norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' norm_decay (float): weight decay for normalization layer weights freeze_norm (bool): freeze normalization layers freeze_at (int): freeze the backbone at which stage return_idx (list): index of the stages whose feature maps are returned dcn_v2_stages (list): index of stages that use deformable conv v2 num_stages (int): total num of stages std_senet (bool): whether to use an SE block in each residual block, default False """ super(ResNet, self).__init__() self._model_type = 'ResNet' if groups == 1 else 'ResNeXt' assert num_stages >= 1 and num_stages <= 4 self.depth = depth self.variant = variant self.groups = groups self.base_width = base_width self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.freeze_at = freeze_at if isinstance(return_idx, Integral): return_idx = [return_idx] assert max(return_idx) < num_stages, \ 'the maximum return index must be smaller than num_stages, ' \ 'but received maximum return index is {} and num_stages ' \ 'is {}'.format(max(return_idx), num_stages) self.return_idx = return_idx self.num_stages = num_stages assert len(lr_mult_list) == 4, \ "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) if isinstance(dcn_v2_stages, Integral): dcn_v2_stages = [dcn_v2_stages] assert max(dcn_v2_stages) < num_stages self.dcn_v2_stages = dcn_v2_stages block_nums = ResNet_cfg[depth] na = NameAdapter(self) conv1_name = na.fix_c1_stage_name() if variant in ['c', 'd']: conv_def = [ [3, ch_in // 2, 3, 2, "conv1_1"], [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"], [ch_in // 2, ch_in, 3, 1, "conv1_3"], ] else: conv_def = [[3, ch_in, 7, 2, conv1_name]] self.conv1 = nn.Sequential() for (c_in, c_out, k, s, _name) in conv_def: self.conv1.add_sublayer( _name, ConvNormLayer( ch_in=c_in, ch_out=c_out, filter_size=k, stride=s, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=1.0)) self.ch_in = ch_in ch_out_list = [64, 128, 256, 512] block = BottleNeck if depth >= 50 else BasicBlock self._out_channels = [block.expansion * v for v in ch_out_list] self._out_strides = [4, 8, 16, 32] self.res_layers = [] for i in range(num_stages): lr_mult = lr_mult_list[i] stage_num = i + 2 res_name = "res{}".format(stage_num) res_layer = self.add_sublayer( res_name, Blocks( block, self.ch_in, ch_out_list[i], count=block_nums[i], name_adapter=na, stage_num=stage_num, variant=variant, groups=groups, base_width=base_width, lr=lr_mult, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=(i in self.dcn_v2_stages), std_senet=std_senet)) self.res_layers.append(res_layer) self.ch_in = self._out_channels[i] if freeze_at >= 0: self._freeze_parameters(self.conv1) if not freeze_stem_only: for i in range(min(freeze_at + 1, num_stages)): self._freeze_parameters(self.res_layers[i]) def _freeze_parameters(self, m): for p in m.parameters(): p.stop_gradient = True @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, inputs): x = inputs['image'] conv1 = self.conv1(x) x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1) outs = [] for idx, stage in enumerate(self.res_layers): x = stage(x) if idx in self.return_idx: outs.append(x) return outs @register class
Res5Head(nn.Layer): def __init__(self, depth=50): super(Res5Head, self).__init__() feat_in, feat_out = [1024, 512] if depth < 50: feat_in = 256 na = NameAdapter(self) block = BottleNeck if depth >= 50 else BasicBlock self.res5 = Blocks( block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5) self.feat_out = feat_out if depth < 50 else feat_out * 4 @property def out_shape(self): return [ShapeSpec( channels=self.feat_out, stride=16, )] def forward(self, roi_feat, stage=0): y = self.res5(roi_feat) return y ================================================ FILE: ppdet/modeling/backbones/senet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle.nn as nn from ppdet.core.workspace import register, serializable from .resnet import ResNet, Blocks, BasicBlock, BottleNeck from ..shape_spec import ShapeSpec from .name_adapter import NameAdapter __all__ = ['SENet', 'SERes5Head'] @register @serializable class SENet(ResNet): __shared__ = ['norm_type'] def __init__(self, depth=50, variant='b', lr_mult_list=[1.0, 1.0, 1.0, 1.0], groups=1, base_width=64, norm_type='bn', norm_decay=0, freeze_norm=True, freeze_at=0, return_idx=[0, 1, 2, 3], dcn_v2_stages=[-1], std_senet=True, num_stages=4): """ Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507 Args: depth (int): SENet depth, should be 50, 101, 152 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages (2,3,4,5); a lower learning rate ratio is needed for pretrained models obtained via distillation (default [1.0, 1.0, 1.0, 1.0]). groups (int): group convolution cardinality base_width (int): base width of each group convolution norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' norm_decay (float): weight decay for normalization layer weights freeze_norm (bool): freeze normalization layers freeze_at (int): freeze the backbone at which stage return_idx (list): index of the stages whose feature maps are returned dcn_v2_stages (list): index of stages that use deformable conv v2 std_senet (bool): whether to use an SE block in each residual block, default True num_stages (int): total num of stages """ super(SENet, self).__init__( depth=depth, variant=variant, lr_mult_list=lr_mult_list, ch_in=128, groups=groups, base_width=base_width, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, freeze_at=freeze_at, return_idx=return_idx, dcn_v2_stages=dcn_v2_stages, std_senet=std_senet, num_stages=num_stages) @register class SERes5Head(nn.Layer): def __init__(self, depth=50, variant='b', lr_mult=1.0, groups=1, base_width=64, norm_type='bn', norm_decay=0, dcn_v2=False, freeze_norm=False, std_senet=True): """ SERes5Head layer Args: depth (int): SENet depth, should be 50, 101, 152 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult (float): learning rate ratio of SERes5Head, default 1.0.
groups (int): group convolution cardinality base_width (int): base width of each group convolution norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' norm_decay (float): weight decay for normalization layer weights dcn_v2_stages (list): index of stages who select deformable conv v2 std_senet (bool): whether use senet, default True """ super(SERes5Head, self).__init__() ch_out = 512 ch_in = 256 if depth < 50 else 1024 na = NameAdapter(self) block = BottleNeck if depth >= 50 else BasicBlock self.res5 = Blocks( block, ch_in, ch_out, count=3, name_adapter=na, stage_num=5, variant=variant, groups=groups, base_width=base_width, lr=lr_mult, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=dcn_v2, std_senet=std_senet) self.ch_out = ch_out * block.expansion @property def out_shape(self): return [ShapeSpec( channels=self.ch_out, stride=16, )] def forward(self, roi_feat): y = self.res5(roi_feat) return y ================================================ FILE: ppdet/modeling/backbones/shufflenet_v2.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle import ParamAttr import paddle.nn.functional as F from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D from paddle.nn.initializer import KaimingNormal from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec from ppdet.modeling.ops import channel_shuffle __all__ = ['ShuffleNetV2'] class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1, act=None): super(ConvBNLayer, self).__init__() self._conv = Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False) self._batch_norm = BatchNorm2D( out_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) if act == "hard_swish": act = 'hardswish' self.act = act def forward(self, inputs): y = self._conv(inputs) y = self._batch_norm(y) if self.act: y = getattr(F, self.act)(y) return y class InvertedResidual(nn.Layer): def __init__(self, in_channels, out_channels, stride, act="relu"): super(InvertedResidual, self).__init__() self._conv_pw = ConvBNLayer( in_channels=in_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=3, stride=stride, padding=1, groups=out_channels // 2, act=None) self._conv_linear = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, 
padding=0, groups=1, act=act) def forward(self, inputs): x1, x2 = paddle.split( inputs, num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], axis=1) x2 = self._conv_pw(x2) x2 = self._conv_dw(x2) x2 = self._conv_linear(x2) out = paddle.concat([x1, x2], axis=1) return channel_shuffle(out, 2) class InvertedResidualDS(nn.Layer): def __init__(self, in_channels, out_channels, stride, act="relu"): super(InvertedResidualDS, self).__init__() # branch1 self._conv_dw_1 = ConvBNLayer( in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, act=None) self._conv_linear_1 = ConvBNLayer( in_channels=in_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) # branch2 self._conv_pw_2 = ConvBNLayer( in_channels=in_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw_2 = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=3, stride=stride, padding=1, groups=out_channels // 2, act=None) self._conv_linear_2 = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) def forward(self, inputs): x1 = self._conv_dw_1(inputs) x1 = self._conv_linear_1(x1) x2 = self._conv_pw_2(inputs) x2 = self._conv_dw_2(x2) x2 = self._conv_linear_2(x2) out = paddle.concat([x1, x2], axis=1) return channel_shuffle(out, 2) @register @serializable class ShuffleNetV2(nn.Layer): def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]): super(ShuffleNetV2, self).__init__() self.scale = scale if isinstance(feature_maps, Integral): feature_maps = [feature_maps] self.feature_maps = feature_maps stage_repeats = [4, 8, 4] if scale == 0.25: stage_out_channels = [-1, 24, 24, 48, 96, 512] elif scale == 0.33: stage_out_channels = [-1, 24, 32, 64, 128, 512] elif scale == 0.5: stage_out_channels = [-1, 24, 48, 96, 192, 1024] elif scale == 1.0: stage_out_channels = [-1, 24, 116, 232, 464, 1024] elif scale == 1.5: stage_out_channels = [-1, 24, 176, 352, 704, 1024] elif scale == 2.0: stage_out_channels = [-1, 24, 244, 488, 976, 2048] else: raise NotImplementedError("This scale size:[" + str(scale) + "] is not implemented!") self._out_channels = [] self._feature_idx = 0 # 1. conv1 self._conv1 = ConvBNLayer( in_channels=3, out_channels=stage_out_channels[1], kernel_size=3, stride=2, padding=1, act=act) self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) self._feature_idx += 1 # 2. 
bottleneck sequences self._block_list = [] for stage_id, num_repeat in enumerate(stage_repeats): for i in range(num_repeat): if i == 0: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidualDS( in_channels=stage_out_channels[stage_id + 1], out_channels=stage_out_channels[stage_id + 2], stride=2, act=act)) else: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidual( in_channels=stage_out_channels[stage_id + 2], out_channels=stage_out_channels[stage_id + 2], stride=1, act=act)) self._block_list.append(block) self._feature_idx += 1 self._update_out_channels(stage_out_channels[stage_id + 2], self._feature_idx, self.feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): y = self._conv1(inputs['image']) y = self._max_pool(y) outs = [] for i, inv in enumerate(self._block_list): y = inv(y) if i + 2 in self.feature_maps: outs.append(y) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/swin_transformer.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
""" This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py Ths copyright of microsoft/Swin-Transformer is as follows: MIT License [see LICENSE for details] """ import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import DropPath, Identity from .transformer_utils import add_parameter, to_2tuple from .transformer_utils import ones_, zeros_, trunc_normal_ __all__ = ['SwinTransformer'] MODEL_cfg = { # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config 'swin_T_224': dict( pretrain_img_size=224, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_S_224': dict( pretrain_img_size=224, embed_dim=96, depths=[2, 2, 18, 2], num_heads=[3, 6, 12, 24], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_B_224': dict( pretrain_img_size=224, embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_L_224': dict( pretrain_img_size=224, embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_B_384': dict( pretrain_img_size=384, embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams', ), 'swin_L_384': dict( pretrain_img_size=384, embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams', ), } class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.reshape( [-1, H // window_size, window_size, W // window_size, window_size, C]) windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( [-1, window_size, window_size, C]) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ _, _, _, C = windows.shape B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.reshape( [-1, H // window_size, W // window_size, window_size, window_size, C]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) return x 
class WindowAttention(nn.Layer): """ Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 # define a parameter table of relative position bias self.relative_position_bias_table = add_parameter( self, paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(self.window_size[0]) coords_w = paddle.arange(self.window_size[1]) coords = paddle.stack(paddle.meshgrid( [coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww coords_flatten_1 = coords_flatten.unsqueeze(axis=2) coords_flatten_2 = coords_flatten.unsqueeze(axis=1) relative_coords = coords_flatten_1 - coords_flatten_2 relative_coords = relative_coords.transpose( [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[ 0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table) self.softmax = nn.Softmax(axis=-1) def forward(self, x, mask=None): """ Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = self.qkv(x).reshape( [-1, N, 3, self.num_heads, C // self.num_heads]).transpose( [2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] q = q * self.scale attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) index = self.relative_position_index.flatten() relative_position_bias = paddle.index_select( self.relative_position_bias_table, index) relative_position_bias = relative_position_bias.reshape([ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ]) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.transpose( [2, 0, 1]) # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.reshape([-1, nW, self.num_heads, N, N ]) + mask.unsqueeze(1).unsqueeze(0) attn = attn.reshape([-1, self.num_heads, N, N]) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Layer): """ Swin Transformer Block. 
Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) self.H = None self.W = None def forward(self, x, mask_matrix): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.reshape([-1, H, W, C]) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t], data_format='NHWC') _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = paddle.roll( x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.reshape( [x_windows.shape[0], self.window_size * self.window_size, C]) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn( x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.reshape( [x_windows.shape[0], self.window_size, self.window_size, C]) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = paddle.roll( shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :] x = x.reshape([-1, H * W, C]) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Layer): r""" Patch Merging Layer. Args: dim (int): Number of input channels. norm_layer (nn.Layer, optional): Normalization layer. 
Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.reshape([-1, H, W, C]) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: # paddle F.pad default data_format is 'NCHW' x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC') H += H % 2 W += W % 2 x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Layer): """ A basic Swin Transformer layer for one stage. Args: dim (int): Number of input channels. depth (int): Number of blocks. num_heads (int): Number of attention heads. window_size (int): Local window size. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None """ def __init__(self, dim, depth, num_heads, window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth # build blocks self.blocks = nn.LayerList([ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, np.ndarray) else drop_path, norm_layer=norm_layer) for i in range(depth) ]) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size) # nW, window_size, window_size, 1 mask_windows = mask_windows.reshape( [-1, self.window_size * self.window_size]) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) huns = -100.0 * paddle.ones_like(attn_mask) attn_mask = huns * (attn_mask != 0).astype("float32") for blk in self.blocks: blk.H, blk.W = H, W x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Layer): """ Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Layer, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): # TODO # export dynamic shape B, C, H, W = x.shape # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1]) if W % self.patch_size[1] != 0: x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) if H % self.patch_size[0] != 0: x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) x = self.proj(x) if self.norm is not None: _, _, Wh, Ww = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = self.norm(x) x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) return x @register @serializable class SwinTransformer(nn.Layer): """ Swin Transformer backbone Args: arch (str): Architecture of FocalNet pretrain_img_size (int | tuple(int)): Input image size. Default 224 patch_size (int | tuple(int)): Patch size. Default: 4 in_chans (int): Number of input image channels. Default: 3 embed_dim (int): Patch embedding dimension. Default: 96 depths (tuple(int)): Depth of each Swin Transformer layer. num_heads (tuple(int)): Number of attention heads in different layers. window_size (int): Window size. Default: 7 mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None drop_rate (float): Dropout rate. Default: 0 attn_drop_rate (float): Attention dropout rate. Default: 0 drop_path_rate (float): Stochastic depth rate. Default: 0.1 norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. 
Default: False patch_norm (bool): If True, add normalization after patch embedding. Default: True """ def __init__(self, arch='swin_T_224', pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, pretrained=None): super(SwinTransformer, self).__init__() assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size'] embed_dim = MODEL_cfg[arch]['embed_dim'] depths = MODEL_cfg[arch]['depths'] num_heads = MODEL_cfg[arch]['num_heads'] window_size = MODEL_cfg[arch]['window_size'] if pretrained is None: pretrained = MODEL_cfg[arch]['pretrained'] self.num_layers = len(depths) self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1] ] self.absolute_pos_embed = add_parameter( self, paddle.zeros((1, embed_dim, patches_resolution[0], patches_resolution[1]))) trunc_normal_(self.absolute_pos_embed) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = np.linspace(0, drop_path_rate, sum(depths)) # stochastic depth decay rule # build layers self.layers = nn.LayerList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2**i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None) self.layers.append(layer) num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f'norm{i_layer}' self.add_sublayer(layer_name, layer) self.apply(self._init_weights) self._freeze_stages() if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained self.set_state_dict(paddle.load(path)) def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.stop_gradient = True if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.stop_gradient = True def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight) if isinstance(m, nn.Linear) and m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.LayerNorm): zeros_(m.bias) ones_(m.weight) def forward(self, x): """Forward function.""" x = self.patch_embed(x['image']) B, _, Wh, Ww = x.shape if self.ape: # interpolate the position embedding to the corresponding 
size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) else: x = x.flatten(2).transpose([0, 2, 1]) x = self.pos_drop(x) outs = [] for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') x_out = norm_layer(x_out) out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( (0, 3, 1, 2)) outs.append(out) return outs @property def out_shape(self): out_strides = [4, 8, 16, 32] return [ ShapeSpec( channels=self.num_features[i], stride=out_strides[i]) for i in self.out_indices ] ================================================ FILE: ppdet/modeling/backbones/trans_encoder.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import ReLU, Swish, GELU import math from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['TransEncoder'] class BertEmbeddings(nn.Layer): def __init__(self, word_size, position_embeddings_size, word_type_size, hidden_size, dropout_prob): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding( word_size, hidden_size, padding_idx=0) self.position_embeddings = nn.Embedding(position_embeddings_size, hidden_size) self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size) self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) self.dropout = nn.Dropout(dropout_prob) def forward(self, x, token_type_ids=None, position_ids=None): seq_len = x.shape[1] if position_ids is None: position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x) if token_type_ids is None: token_type_ids = paddle.zeros(x.shape, dtype="int64") word_embs = self.word_embeddings(x) position_embs = self.position_embeddings(position_ids) token_type_embs = self.token_type_embeddings(token_type_ids) embs_cmb = word_embs + position_embs + token_type_embs embs_out = self.layernorm(embs_cmb) embs_out = self.dropout(embs_out) return embs_out class BertSelfAttention(nn.Layer): def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, output_attentions=False): super(BertSelfAttention, self).__init__() if hidden_size % num_attention_heads != 0: raise ValueError( "The hidden_size must be a multiple of the number of attention " "heads, but got {} % {} != 0".format(hidden_size, num_attention_heads)) self.num_attention_heads = num_attention_heads self.attention_head_size = int(hidden_size / num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(hidden_size, self.all_head_size) self.key = nn.Linear(hidden_size, self.all_head_size) self.value = nn.Linear(hidden_size, self.all_head_size) self.dropout = nn.Dropout(attention_probs_dropout_prob) self.output_attentions = output_attentions def forward(self, x,
attention_mask, head_mask=None): query = self.query(x) key = self.key(x) value = self.value(x) query_dim1, query_dim2 = query.shape[:-1] new_shape = [ query_dim1, query_dim2, self.num_attention_heads, self.attention_head_size ] query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1)) value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) attention = paddle.matmul(query, key) / math.sqrt(self.attention_head_size) attention = attention + attention_mask attention_value = F.softmax(attention, axis=-1) attention_value = self.dropout(attention_value) if head_mask is not None: attention_value = attention_value * head_mask context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1, 3)) ctx_dim1, ctx_dim2 = context.shape[:-2] new_context_shape = [ ctx_dim1, ctx_dim2, self.all_head_size, ] context = context.reshape(new_context_shape) if self.output_attentions: return (context, attention_value) else: return (context, ) class BertAttention(nn.Layer): def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, output_attentions=False): super(BertAttention, self).__init__() self.bert_selfattention = BertSelfAttention( hidden_size, num_attention_heads, attention_probs_dropout_prob, output_attentions) self.fc = nn.Linear(hidden_size, hidden_size) self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) self.dropout = nn.Dropout(fc_dropout_prob) def forward(self, x, attention_mask, head_mask=None): attention_feats = self.bert_selfattention(x, attention_mask, head_mask) features = self.fc(attention_feats[0]) features = self.dropout(features) features = self.layernorm(features + x) if len(attention_feats) == 2: return (features, attention_feats[1]) else: return (features, ) class BertFeedForward(nn.Layer): def __init__(self, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False): super(BertFeedForward, self).__init__() self.fc1 = nn.Linear(hidden_size, intermediate_size) self.act_fn = eval(act_fn) self.fc2 = nn.Linear(intermediate_size, hidden_size) self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) self.dropout = nn.Dropout(fc_dropout_prob) def forward(self, x): features = self.fc1(x) features = self.act_fn(features) features = self.fc2(features) features = self.dropout(features) features = self.layernorm(features + x) return features class BertLayer(nn.Layer): def __init__(self, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False): super(BertLayer, self).__init__() self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob, output_attentions) self.feed_forward = BertFeedForward( hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions) def forward(self, x, attention_mask, head_mask=None): attention_feats = self.attention(x, attention_mask, head_mask) features = self.feed_forward(attention_feats[0]) if len(attention_feats) == 2: return (features, attention_feats[1]) else: return (features, ) class BertEncoder(nn.Layer): def __init__(self, num_hidden_layers, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False, output_hidden_feats=False): super(BertEncoder, self).__init__() self.output_attentions = output_attentions 
self.output_hidden_feats = output_hidden_feats self.layers = nn.LayerList([ BertLayer(hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions) for _ in range(num_hidden_layers) ]) def forward(self, x, attention_mask, head_mask=None): all_features = (x, ) all_attentions = () for i, layer in enumerate(self.layers): mask = head_mask[i] if head_mask is not None else None layer_out = layer(x, attention_mask, mask) if self.output_hidden_feats: all_features = all_features + (x, ) x = layer_out[0] if self.output_attentions: all_attentions = all_attentions + (layer_out[1], ) outputs = (x, ) if self.output_hidden_feats: outputs += (all_features, ) if self.output_attentions: outputs += (all_attentions, ) return outputs class BertPooler(nn.Layer): def __init__(self, hidden_size): super(BertPooler, self).__init__() self.fc = nn.Linear(hidden_size, hidden_size) self.act = nn.Tanh() def forward(self, x): first_token = x[:, 0] pooled_output = self.fc(first_token) pooled_output = self.act(pooled_output) return pooled_output class METROEncoder(nn.Layer): def __init__(self, vocab_size, num_hidden_layers, features_dims, position_embeddings_size, hidden_size, intermediate_size, output_feature_dim, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False, output_hidden_feats=False, use_img_layernorm=False): super(METROEncoder, self).__init__() self.img_dims = features_dims self.num_hidden_layers = num_hidden_layers self.use_img_layernorm = use_img_layernorm self.output_attentions = output_attentions self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2, hidden_size, fc_dropout_prob) self.encoder = BertEncoder( num_hidden_layers, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions, output_hidden_feats) self.pooler = BertPooler(hidden_size) self.position_embeddings = nn.Embedding(position_embeddings_size, hidden_size) self.img_embedding = nn.Linear( features_dims, hidden_size, bias_attr=True) self.dropout = nn.Dropout(fc_dropout_prob) self.cls_head = nn.Linear(hidden_size, output_feature_dim) self.residual = nn.Linear(features_dims, output_feature_dim) self.apply(self.init_weights) def init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, (nn.Linear, nn.Embedding)): module.weight.set_value( paddle.normal( mean=0.0, std=0.02, shape=module.weight.shape)) elif isinstance(module, nn.LayerNorm): module.bias.set_value(paddle.zeros(shape=module.bias.shape)) module.weight.set_value( paddle.full( shape=module.weight.shape, fill_value=1.0)) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.set_value(paddle.zeros(shape=module.bias.shape)) def forward(self, x): batchsize, seq_len = x.shape[:2] input_ids = paddle.zeros((batchsize, seq_len), dtype="int64") position_ids = paddle.arange( seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids) attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2) head_mask = [None] * self.num_hidden_layers position_embs = self.position_embeddings(position_ids) attention_mask = (1.0 - attention_mask) * -10000.0 img_features = self.img_embedding(x) # We empirically observe that adding an additional learnable position embedding leads to more stable training embeddings = position_embs + img_features if self.use_img_layernorm: embeddings = self.layernorm(embeddings) embeddings = self.dropout(embeddings) encoder_outputs = self.encoder( embeddings, attention_mask, head_mask=head_mask) pred_score = self.cls_head(encoder_outputs[0]) res_img_feats = self.residual(x) pred_score = pred_score + res_img_feats if self.output_attentions and self.output_hidden_feats: return pred_score, encoder_outputs[1], encoder_outputs[-1] else: return pred_score def gelu(x): """Implementation of the gelu activation function. https://arxiv.org/abs/1606.08415 """ return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) @register class TransEncoder(nn.Layer): def __init__(self, vocab_size=30522, num_hidden_layers=4, num_attention_heads=4, position_embeddings_size=512, intermediate_size=3072, input_feat_dim=[2048, 512, 128], hidden_feat_dim=[1024, 256, 128], attention_probs_dropout_prob=0.1, fc_dropout_prob=0.1, act_fn='gelu', output_attentions=False, output_hidden_feats=False): super(TransEncoder, self).__init__() output_feat_dim = input_feat_dim[1:] + [3] trans_encoder = [] for i in range(len(output_feat_dim)): features_dims = input_feat_dim[i] output_feature_dim = output_feat_dim[i] hidden_size = hidden_feat_dim[i] # init a transformer encoder and append it to a list assert hidden_size % num_attention_heads == 0 model = METROEncoder(vocab_size, num_hidden_layers, features_dims, position_embeddings_size, hidden_size, intermediate_size, output_feature_dim, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions, output_hidden_feats) trans_encoder.append(model) self.trans_encoder = paddle.nn.Sequential(*trans_encoder) def forward(self, x): out = self.trans_encoder(x) return out ================================================ FILE: ppdet/modeling/backbones/transformer_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import TruncatedNormal, Constant, Assign # Common initializations ones_ = Constant(value=1.) zeros_ = Constant(value=0.) trunc_normal_ = TruncatedNormal(std=.02) # Common Layers def drop_path(x, drop_prob=0., training=False): """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... """ if drop_prob == 0. or not training: return x keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor return output class DropPath(nn.Layer): def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) class Identity(nn.Layer): def __init__(self): super(Identity, self).__init__() def forward(self, input): return input # common funcs def to_2tuple(x): if isinstance(x, (list, tuple)): return x return tuple([x] * 2) def add_parameter(layer, datas, name=None): parameter = layer.create_parameter( shape=(datas.shape), default_initializer=Assign(datas)) if name: layer.add_parameter(name, parameter) return parameter def window_partition(x, window_size): """ Partition into non-overlapping windows with padding if needed. Args: x (tensor): input tokens with [B, H, W, C]. window_size (int): window size. Returns: windows: windows after partition with [B * num_windows, window_size, window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size x = F.pad(x.transpose([0, 3, 1, 2]), paddle.to_tensor( [0, int(pad_w), 0, int(pad_h)], dtype='int32')).transpose([0, 2, 3, 1]) Hp, Wp = H + pad_h, W + pad_w num_h, num_w = Hp // window_size, Wp // window_size x = x.reshape([B, num_h, window_size, num_w, window_size, C]) windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( [-1, window_size, window_size, C]) return windows, (Hp, Wp), (num_h, num_w) def window_unpartition(x, pad_hw, num_hw, hw): """ Window unpartition into original sequences and removing padding. Args: x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. Returns: x: unpartitioned sequences with [B, H, W, C]. 
""" Hp, Wp = pad_hw num_h, num_w = num_hw H, W = hw B, window_size, _, C = x.shape B = B // (num_h * num_w) x = x.reshape([B, num_h, num_w, window_size, window_size, C]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C]) return x[:, :H, :W, :] ================================================ FILE: ppdet/modeling/backbones/vgg.py ================================================ from __future__ import division import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn import Conv2D, MaxPool2D from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['VGG'] VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]} class ConvBlock(nn.Layer): def __init__(self, in_channels, out_channels, groups, pool_size=2, pool_stride=2, pool_padding=0, name=None): super(ConvBlock, self).__init__() self.groups = groups self.conv0 = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1) self.conv_out_list = [] for i in range(1, groups): conv_out = self.add_sublayer( 'conv{}'.format(i), Conv2D( in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)) self.conv_out_list.append(conv_out) self.pool = MaxPool2D( kernel_size=pool_size, stride=pool_stride, padding=pool_padding, ceil_mode=True) def forward(self, inputs): out = self.conv0(inputs) out = F.relu(out) for conv_i in self.conv_out_list: out = conv_i(out) out = F.relu(out) pool = self.pool(out) return out, pool class ExtraBlock(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, padding, stride, kernel_size, name=None): super(ExtraBlock, self).__init__() self.conv0 = Conv2D( in_channels=in_channels, out_channels=mid_channels, kernel_size=1, stride=1, padding=0) self.conv1 = Conv2D( in_channels=mid_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding) def forward(self, inputs): out = self.conv0(inputs) out = F.relu(out) out = self.conv1(out) out = F.relu(out) return out class L2NormScale(nn.Layer): def __init__(self, num_channels, scale=1.0): super(L2NormScale, self).__init__() self.scale = self.create_parameter( attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)), shape=[num_channels]) def forward(self, inputs): out = F.normalize(inputs, axis=1, epsilon=1e-10) # out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( # out) * out out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out return out @register @serializable class VGG(nn.Layer): def __init__(self, depth=16, normalizations=[20., -1, -1, -1, -1, -1], extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 0, 1, 3], [128, 256, 0, 1, 3]]): super(VGG, self).__init__() assert depth in [16, 19], \ "depth as 16/19 supported currently, but got {}".format(depth) self.depth = depth self.groups = VGG_cfg[depth] self.normalizations = normalizations self.extra_block_filters = extra_block_filters self._out_channels = [] self.conv_block_0 = ConvBlock( 3, 64, self.groups[0], 2, 2, 0, name="conv1_") self.conv_block_1 = ConvBlock( 64, 128, self.groups[1], 2, 2, 0, name="conv2_") self.conv_block_2 = ConvBlock( 128, 256, self.groups[2], 2, 2, 0, name="conv3_") self.conv_block_3 = ConvBlock( 256, 512, self.groups[3], 2, 2, 0, name="conv4_") self.conv_block_4 = ConvBlock( 512, 512, self.groups[4], 3, 1, 1, name="conv5_") self._out_channels.append(512) self.fc6 = Conv2D( in_channels=512, out_channels=1024, kernel_size=3, stride=1, 
padding=6, dilation=6) self.fc7 = Conv2D( in_channels=1024, out_channels=1024, kernel_size=1, stride=1, padding=0) self._out_channels.append(1024) # extra block self.extra_convs = [] last_channels = 1024 for i, v in enumerate(self.extra_block_filters): assert len(v) == 5, "extra_block_filters size not fix" extra_conv = self.add_sublayer("conv{}".format(6 + i), ExtraBlock(last_channels, v[0], v[1], v[2], v[3], v[4])) last_channels = v[1] self.extra_convs.append(extra_conv) self._out_channels.append(last_channels) self.norms = [] for i, n in enumerate(self.normalizations): if n != -1: norm = self.add_sublayer("norm{}".format(i), L2NormScale( self.extra_block_filters[i][1], n)) else: norm = None self.norms.append(norm) def forward(self, inputs): outputs = [] conv, pool = self.conv_block_0(inputs['image']) conv, pool = self.conv_block_1(pool) conv, pool = self.conv_block_2(pool) conv, pool = self.conv_block_3(pool) outputs.append(conv) conv, pool = self.conv_block_4(pool) out = self.fc6(pool) out = F.relu(out) out = self.fc7(out) out = F.relu(out) outputs.append(out) if not self.extra_block_filters: return outputs # extra block for extra_conv in self.extra_convs: out = extra_conv(out) outputs.append(out) for i, n in enumerate(self.normalizations): if n != -1: outputs[i] = self.norms[i](outputs[i]) return outputs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/vision_transformer.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
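# --------------------------------------------------------------------------
# Editor's note: L2NormScale in vgg.py above implements the SSD-style trick of
# L2-normalizing a feature map across channels and rescaling it with a
# learnable per-channel weight (initialized from `normalizations`, 20.0 for
# the first output by default). A functional sketch of what it computes,
# illustrative only and not part of the original file:
def _l2norm_scale_demo():
    import paddle
    import paddle.nn.functional as F
    x = paddle.randn([1, 512, 8, 8])  # (B, C, H, W) feature map
    y = F.normalize(x, axis=1, epsilon=1e-10) * 20.0  # unit channel norm, then rescale
    return y  # channel-wise L2 norm at each spatial location is now ~20
# --------------------------------------------------------------------------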
import math import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np from paddle.nn.initializer import Constant from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import zeros_, DropPath, Identity class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class Attention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., window_size=None): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) if qkv_bias: self.q_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) self.v_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) else: self.q_bias = None self.v_bias = None if window_size: self.window_size = window_size self.num_relative_distance = (2 * window_size[0] - 1) * ( 2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( shape=(self.num_relative_distance, num_heads), default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(window_size[0]) coords_w = paddle.arange(window_size[1]) coords = paddle.stack(paddle.meshgrid( [coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2) coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1) relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone( ) #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh relative_coords = relative_coords.transpose( (1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[ 0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 relative_position_index = \ paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) relative_position_index[1:, 1:] = relative_coords.sum( -1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 self.register_buffer("relative_position_index", relative_position_index) # trunc_normal_(self.relative_position_bias_table, std=.0) else: self.window_size = None self.relative_position_bias_table = None self.relative_position_index = None self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x, rel_pos_bias=None): x_shape = x.shape N, C = x_shape[1], x_shape[2] qkv_bias = None if self.q_bias is not None: qkv_bias = paddle.concat( (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape((-1, N, 3, self.num_heads, C // 
self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale if self.relative_position_bias_table is not None: relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.reshape([-1])].reshape([ self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1 ]) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.transpose( (2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if rel_pos_bias is not None: attn = attn + rel_pos_bias attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) x = self.proj(x) x = self.proj_drop(x) return x class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., window_size=None, init_values=None, act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, window_size=window_size) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) if init_values is not None: self.gamma_1 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) self.gamma_2 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) else: self.gamma_1, self.gamma_2 = None, None def forward(self, x, rel_pos_bias=None): if self.gamma_1 is None: x = x + self.drop_path( self.attn( self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.mlp(self.norm2(x))) else: x = x + self.drop_path(self.gamma_1 * self.attn( self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x class PatchEmbed(nn.Layer): """ Image to Patch Embedding """ def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768): super().__init__() self.num_patches_w = img_size[0] // patch_size self.num_patches_h = img_size[1] // patch_size num_patches = self.num_patches_w * self.num_patches_h self.patch_shape = (img_size[0] // patch_size, img_size[1] // patch_size) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) @property def num_patches_in_h(self): return self.img_size[1] // self.patch_size @property def num_patches_in_w(self): return self.img_size[0] // self.patch_size def forward(self, x, mask=None): B, C, H, W = x.shape return self.proj(x) class RelativePositionBias(nn.Layer): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size self.num_relative_distance = (2 * window_size[0] - 1) * ( 2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( shape=(self.num_relative_distance, num_heads), default_initializer=zeros_) # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1]) coords = paddle.stack(paddle.meshgrid( [coords_h, coords_w])) # 2, Wh, Ww coords_flatten = coords.flatten(1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.transpose( (1, 2, 0)) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 relative_position_index = \ paddle.zeros(shape=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) relative_position_index[1:, 1:] = relative_coords.sum( -1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 self.register_buffer("relative_position_index", relative_position_index) def forward(self): relative_position_bias = \ self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww def get_sinusoid_encoding_table(n_position, d_hid, token=False): ''' Sinusoid position encoding table ''' def get_position_angle_vec(position): return [ position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid) ] sinusoid_table = np.array( [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 if token: sinusoid_table = np.concatenate( [sinusoid_table, np.zeros([1, d_hid])], axis=0) return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0) @register @serializable class VisionTransformer(nn.Layer): """ Vision Transformer with support for patch input """ def __init__(self, img_size=[672, 1092], patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer='nn.LayerNorm', init_values=None, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, epsilon=1e-5, final_norm=False, pretrained=None, out_indices=[3, 5, 7, 11], use_abs_pos_emb=False, use_sincos_pos_emb=True, with_fpn=True, num_fpn_levels=4, use_checkpoint=False, **args): super().__init__() self.img_size = img_size self.embed_dim = embed_dim self.with_fpn = with_fpn self.use_checkpoint = use_checkpoint self.use_sincos_pos_emb = use_sincos_pos_emb self.use_rel_pos_bias = use_rel_pos_bias self.final_norm = final_norm self.out_indices = out_indices self.num_fpn_levels = num_fpn_levels if use_checkpoint: paddle.seed(0) self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) self.pos_w = self.patch_embed.num_patches_in_w self.pos_h = self.patch_embed.num_patches_in_h self.cls_token = self.create_parameter( shape=(1, 1, embed_dim), default_initializer=paddle.nn.initializer.Constant(value=0.)) if use_abs_pos_emb: self.pos_embed = self.create_parameter( shape=(1, self.pos_w * self.pos_h + 1, embed_dim), default_initializer=paddle.nn.initializer.TruncatedNormal( std=.02)) elif use_sincos_pos_emb: pos_embed = self.build_2d_sincos_position_embedding(embed_dim) self.pos_embed = pos_embed self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy()) self.pos_embed.stop_gradient = True else: self.pos_embed = None self.pos_drop = nn.Dropout(p=drop_rate) if use_shared_rel_pos_bias: self.rel_pos_bias = RelativePositionBias( window_size=self.patch_embed.patch_shape, num_heads=num_heads) else: self.rel_pos_bias = None dpr = np.linspace(0, drop_path_rate, depth) self.blocks = nn.LayerList([ Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, epsilon=epsilon) for i in range(depth) ]) self.pretrained = pretrained self.init_weight() assert len(out_indices) <= 4, '' self.out_indices = out_indices self.out_channels = [embed_dim for _ in range(num_fpn_levels)] self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [ patch_size for _ in range(len(out_indices)) ] self.norm = Identity() if self.with_fpn: assert num_fpn_levels <= 4, '' self.init_fpn( embed_dim=embed_dim, patch_size=patch_size, ) def init_weight(self): pretrained = self.pretrained if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained load_state_dict = paddle.load(path) model_state_dict = self.state_dict() pos_embed_name = "pos_embed" if pos_embed_name in load_state_dict.keys(): load_pos_embed = paddle.to_tensor( load_state_dict[pos_embed_name], dtype="float32") if self.pos_embed.shape != load_pos_embed.shape: pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) model_state_dict[pos_embed_name] = self.resize_pos_embed( load_pos_embed, (pos_size, pos_size), (self.pos_h, self.pos_w)) # self.set_state_dict(model_state_dict) load_state_dict[pos_embed_name] = model_state_dict[ pos_embed_name] print("Load pos_embed and resize it from {} to {} .".format( load_pos_embed.shape, self.pos_embed.shape)) self.set_state_dict(load_state_dict) print("Load load_state_dict....") def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): if patch_size == 16: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), nn.BatchNorm2D(embed_dim), nn.GELU(), nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn3 = Identity() self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) elif patch_size == 8: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = Identity() self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) if not out_with_norm: self.norm = Identity() else: self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) def interpolate_pos_encoding(self, x, w, h): npatch = x.shape[1] - 1 N = self.pos_embed.shape[1] - 1 w0 = w // self.patch_embed.patch_size h0 = h // self.patch_embed.patch_size if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h: return self.pos_embed class_pos_embed = self.pos_embed[:, 0] patch_pos_embed = self.pos_embed[:, 1:] dim = x.shape[-1] # we add a small number to avoid floating point error in the interpolation # see discussion at https://github.com/facebookresearch/dino/issues/8 # w0, h0 = w0 + 0.1, h0 + 0.1 # patch_pos_embed = nn.functional.interpolate( # 
patch_pos_embed.reshape([ # 1, self.patch_embed.num_patches_w, # self.patch_embed.num_patches_h, dim # ]).transpose((0, 3, 1, 2)), # scale_factor=(w0 / self.patch_embed.num_patches_w, # h0 / self.patch_embed.num_patches_h), # mode='bicubic', ) patch_pos_embed = nn.functional.interpolate( patch_pos_embed.reshape([ 1, self.patch_embed.num_patches_w, self.patch_embed.num_patches_h, dim ]).transpose((0, 3, 1, 2)), (w0, h0), mode='bicubic', ) assert int(w0) == patch_pos_embed.shape[-2] and int( h0) == patch_pos_embed.shape[-1] patch_pos_embed = patch_pos_embed.transpose( (0, 2, 3, 1)).reshape([1, -1, dim]) return paddle.concat( (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1) def resize_pos_embed(self, pos_embed, old_hw, new_hw): """ Resize pos_embed weight. Args: pos_embed (Tensor): the pos_embed weight old_hw (list[int]): the height and width of old pos_embed new_hw (list[int]): the height and width of new pos_embed Returns: Tensor: the resized pos_embed weight """ cls_pos_embed = pos_embed[:, :1, :] pos_embed = pos_embed[:, 1:, :] pos_embed = pos_embed.transpose([0, 2, 1]) pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) pos_embed = F.interpolate( pos_embed, new_hw, mode='bicubic', align_corners=False) pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) return pos_embed def build_2d_sincos_position_embedding( self, embed_dim=768, temperature=10000., ): h, w = self.patch_embed.patch_shape grid_w = paddle.arange(w, dtype=paddle.float32) grid_h = paddle.arange(h, dtype=paddle.float32) grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = 1. 
/ (temperature**omega) out_w = grid_w.flatten()[..., None] @omega[None] out_h = grid_h.flatten()[..., None] @omega[None] pos_emb = paddle.concat( [ paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), paddle.cos(out_h) ], axis=1)[None, :, :] pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) pos_embed = paddle.concat([pe_token, pos_emb], axis=1) # pos_embed.stop_gradient = True return pos_embed def forward(self, x): x = x['image'] if isinstance(x, dict) else x _, _, h, w = x.shape x = self.patch_embed(x) B, D, Hp, Wp = x.shape # b * c * h * w cls_tokens = self.cls_token.expand( (B, self.cls_token.shape[-2], self.cls_token.shape[-1])) x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c x = paddle.concat([cls_tokens, x], axis=1) if self.pos_embed is not None: # x = x + self.interpolate_pos_encoding(x, w, h) x = x + self.interpolate_pos_encoding(x, h, w) x = self.pos_drop(x) rel_pos_bias = self.rel_pos_bias( ) if self.rel_pos_bias is not None else None feats = [] for idx, blk in enumerate(self.blocks): if self.use_checkpoint and self.training: x = paddle.distributed.fleet.utils.recompute( blk, x, rel_pos_bias, **{"preserve_rng_state": True}) else: x = blk(x, rel_pos_bias) if idx in self.out_indices: xp = paddle.reshape( paddle.transpose( self.norm(x[:, 1:, :]), perm=[0, 2, 1]), shape=[B, D, Hp, Wp]) feats.append(xp) if self.with_fpn: fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][ -self.num_fpn_levels:] assert len(fpns) == len(feats) or len(feats) == 1, '' outputs = [] for i, m in enumerate(fpns): outputs.append( m(feats[i] if len(feats) == len(fpns) else feats[-1])) return outputs return feats @property def num_layers(self): return len(self.blocks) @property def no_weight_decay(self): return {'pos_embed', 'cls_token'} @property def out_shape(self): return [ ShapeSpec( channels=c, stride=s) for c, s in zip(self.out_channels, self.out_strides) ] ================================================ FILE: ppdet/modeling/backbones/vit_mae.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
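# A standalone sketch (toy sizes, illustrative only) of the 2D sin-cos position
# embedding that build_2d_sincos_position_embedding above constructs;
# get_2d_sincos_position_embedding in this file follows the same recipe.
def _sincos_embed_sketch(h=2, w=3, embed_dim=8, temperature=10000.):
    import paddle
    grid_w = paddle.arange(w, dtype=paddle.float32)
    grid_h = paddle.arange(h, dtype=paddle.float32)
    grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
    pos_dim = embed_dim // 4  # embed_dim must be divisible by 4
    omega = 1. / (temperature**(
        paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim))
    # each grid coordinate contributes sin and cos features at pos_dim frequencies
    out_w = grid_w.flatten()[..., None] @ omega[None]
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return paddle.concat(
        [paddle.sin(out_w), paddle.cos(out_w),
         paddle.sin(out_h), paddle.cos(out_h)],
        axis=1)  # [h * w, embed_dim]; the model prepends a zero cls-token row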
import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np import math from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Constant, TruncatedNormal from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import (zeros_, DropPath, Identity, window_partition, window_unpartition) from ..initializer import linear_init_ __all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid'] class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer='nn.GELU', drop=0., lr_factor=1.0): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear( in_features, hidden_features, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) self.act = eval(act_layer)() self.fc2 = nn.Linear( hidden_features, out_features, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) self.drop = nn.Dropout(drop) self._init_weights() def _init_weights(self): linear_init_(self.fc1) linear_init_(self.fc2) def forward(self, x): x = self.drop(self.act(self.fc1(x))) x = self.drop(self.fc2(x)) return x class Attention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, attn_bias=False, attn_drop=0., proj_drop=0., use_rel_pos=False, rel_pos_zero_init=True, window_size=None, input_size=None, qk_scale=None, lr_factor=1.0): super().__init__() self.num_heads = num_heads self.head_dim = dim // num_heads self.scale = qk_scale or self.head_dim**-0.5 self.use_rel_pos = use_rel_pos self.input_size = input_size self.rel_pos_zero_init = rel_pos_zero_init self.window_size = window_size self.lr_factor = lr_factor self.qkv = nn.Linear( dim, dim * 3, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor) if attn_bias else False) if qkv_bias: self.q_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) self.v_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) else: self.q_bias = None self.v_bias = None self.proj = nn.Linear( dim, dim, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) self.attn_drop = nn.Dropout(attn_drop) if window_size is None: self.window_size = self.input_size[0] self._init_weights() def _init_weights(self): linear_init_(self.qkv) linear_init_(self.proj) if self.use_rel_pos: self.rel_pos_h = self.create_parameter( [2 * self.window_size - 1, self.head_dim], attr=ParamAttr(learning_rate=self.lr_factor), default_initializer=Constant(value=0.)) self.rel_pos_w = self.create_parameter( [2 * self.window_size - 1, self.head_dim], attr=ParamAttr(learning_rate=self.lr_factor), default_initializer=Constant(value=0.)) if not self.rel_pos_zero_init: TruncatedNormal(self.rel_pos_h, std=0.02) TruncatedNormal(self.rel_pos_w, std=0.02) def get_rel_pos(self, seq_size, rel_pos): max_rel_dist = int(2 * seq_size - 1) # Interpolate rel pos if needed. if rel_pos.shape[0] != max_rel_dist: # Interpolate rel pos. 
rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1]) rel_pos = rel_pos.transpose([0, 2, 1]) rel_pos_resized = F.interpolate( rel_pos, size=(max_rel_dist, ), mode="linear", data_format='NCW') rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]) rel_pos_resized = rel_pos_resized.transpose([1, 0]) else: rel_pos_resized = rel_pos coords = paddle.arange(seq_size, dtype='float32') relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0) relative_coords += (seq_size - 1) relative_coords = relative_coords.astype('int64').flatten() return paddle.index_select(rel_pos_resized, relative_coords).reshape( [seq_size, seq_size, self.head_dim]) def add_decomposed_rel_pos(self, attn, q, h, w): """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). Returns: attn (Tensor): attention map with added relative positional embeddings. """ Rh = self.get_rel_pos(h, self.rel_pos_h) Rw = self.get_rel_pos(w, self.rel_pos_w) B, _, dim = q.shape r_q = q.reshape([B, h, w, dim]) # bhwc, hch->bhwh1 # bwhc, wcw->bhw1w rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1) rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2) attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w return attn.reshape([B, h * w, h * w]) def forward(self, x): B, H, W, C = x.shape if self.q_bias is not None: qkv_bias = paddle.concat( (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) else: qkv = self.qkv(x).reshape( [B, H * W, 3, self.num_heads, self.head_dim]).transpose( [2, 0, 3, 1, 4]).reshape( [3, B * self.num_heads, H * W, self.head_dim]) q, k, v = qkv[0], qkv[1], qkv[2] attn = q.matmul(k.transpose([0, 2, 1])) * self.scale if self.use_rel_pos: attn = self.add_decomposed_rel_pos(attn, q, H, W) attn = F.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = attn.matmul(v).reshape( [B, self.num_heads, H * W, self.head_dim]).transpose( [0, 2, 1, 3]).reshape([B, H, W, C]) x = self.proj(x) return x class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, attn_bias=False, qk_scale=None, init_values=None, drop=0., attn_drop=0., drop_path=0., use_rel_pos=True, rel_pos_zero_init=True, window_size=None, input_size=None, act_layer='nn.GELU', norm_layer='nn.LayerNorm', lr_factor=1.0, epsilon=1e-5): super().__init__() self.window_size = window_size self.norm1 = eval(norm_layer)(dim, weight_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), bias_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), epsilon=epsilon) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_bias=attn_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size, input_size=input_size, lr_factor=lr_factor) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() self.norm2 = eval(norm_layer)(dim, weight_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), bias_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), epsilon=epsilon) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop, lr_factor=lr_factor) if init_values is not None: self.gamma_1 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) self.gamma_2 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) else: self.gamma_1, self.gamma_2 = None, None def forward(self, x): y = self.norm1(x) if self.window_size is not None: y, pad_hw, num_hw = window_partition(y, self.window_size) y = self.attn(y) if self.gamma_1 is not None: y = self.gamma_1 * y if self.window_size is not None: y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2])) x = x + self.drop_path(y) if self.gamma_2 is None: x = x + self.drop_path(self.mlp(self.norm2(x))) else: x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x class PatchEmbed(nn.Layer): """ Image to Patch Embedding """ def __init__(self, img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768, lr_factor=0.01): super().__init__() self.img_size = img_size self.patch_size = patch_size self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) @property def num_patches_in_h(self): return self.img_size[1] // self.patch_size @property def num_patches_in_w(self): return self.img_size[0] // self.patch_size def forward(self, x): out = self.proj(x) return out @register @serializable class VisionTransformer2D(nn.Layer): """ Vision Transformer with support for patch input """ def __init__(self, img_size=(1024, 1024), patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, attn_bias=False, qk_scale=None, init_values=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_layer='nn.GELU', norm_layer='nn.LayerNorm', lr_decay_rate=1.0, global_attn_indexes=(2, 5, 8, 11), use_abs_pos=False, use_rel_pos=False, use_abs_pos_emb=False, use_sincos_pos_emb=False, rel_pos_zero_init=True, epsilon=1e-5, final_norm=False, pretrained=None, window_size=None, out_indices=(11, ), with_fpn=False, use_checkpoint=False, *args, **kwargs): super().__init__() self.img_size = img_size self.patch_size = patch_size self.embed_dim = embed_dim self.num_heads = num_heads self.depth = depth self.global_attn_indexes = global_attn_indexes self.epsilon = epsilon self.with_fpn = with_fpn self.use_checkpoint = use_checkpoint self.patch_h = img_size[0] // patch_size self.patch_w = img_size[1] // patch_size self.num_patches = self.patch_h * self.patch_w self.use_abs_pos = use_abs_pos self.use_abs_pos_emb = use_abs_pos_emb self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) dpr = np.linspace(0, drop_path_rate, depth) if use_checkpoint: paddle.seed(0) if use_abs_pos_emb: self.pos_w = self.patch_embed.num_patches_in_w self.pos_h = self.patch_embed.num_patches_in_h self.pos_embed = self.create_parameter( shape=(1, self.pos_w * self.pos_h + 1, embed_dim), default_initializer=paddle.nn.initializer.TruncatedNormal( std=.02)) elif use_sincos_pos_emb: pos_embed = self.get_2d_sincos_position_embedding(self.patch_h, self.patch_w) self.pos_embed = pos_embed self.pos_embed = 
self.create_parameter(shape=pos_embed.shape) self.pos_embed.set_value(pos_embed.numpy()) self.pos_embed.stop_gradient = True else: self.pos_embed = None self.blocks = nn.LayerList([ Block( embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, attn_bias=attn_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=None if i in self.global_attn_indexes else window_size, input_size=[self.patch_h, self.patch_w], act_layer=act_layer, lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate), norm_layer=norm_layer, init_values=init_values, epsilon=epsilon) for i in range(depth) ]) assert len(out_indices) <= 4, 'out_indices out of bound' self.out_indices = out_indices self.pretrained = pretrained self.init_weight() self.out_channels = [embed_dim for _ in range(len(out_indices))] self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ patch_size for _ in range(len(out_indices)) ] self.norm = Identity() if self.with_fpn: self.init_fpn( embed_dim=embed_dim, patch_size=patch_size, out_with_norm=final_norm) def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate): return lr_decay_rate**(self.depth - layer_id) def init_weight(self): pretrained = self.pretrained if pretrained: if 'http' in pretrained: path = paddle.utils.download.get_weights_path_from_url( pretrained) else: path = pretrained load_state_dict = paddle.load(path) model_state_dict = self.state_dict() pos_embed_name = "pos_embed" if pos_embed_name in load_state_dict.keys( ) and self.use_abs_pos_emb: load_pos_embed = paddle.to_tensor( load_state_dict[pos_embed_name], dtype="float32") if self.pos_embed.shape != load_pos_embed.shape: pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) model_state_dict[pos_embed_name] = self.resize_pos_embed( load_pos_embed, (pos_size, pos_size), (self.pos_h, self.pos_w)) # self.set_state_dict(model_state_dict) load_state_dict[pos_embed_name] = model_state_dict[ pos_embed_name] print("Load pos_embed and resize it from {} to {} .".format( load_pos_embed.shape, self.pos_embed.shape)) self.set_state_dict(load_state_dict) print("Load load_state_dict....") def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): if patch_size == 16: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), nn.BatchNorm2D(embed_dim), nn.GELU(), nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn3 = Identity() self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) elif patch_size == 8: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = Identity() self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) if not out_with_norm: self.norm = Identity() else: self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon) def resize_pos_embed(self, pos_embed, old_hw, new_hw): """ Resize pos_embed weight. 
Args: pos_embed (Tensor): the pos_embed weight old_hw (list[int]): the height and width of old pos_embed new_hw (list[int]): the height and width of new pos_embed Returns: Tensor: the resized pos_embed weight """ cls_pos_embed = pos_embed[:, :1, :] pos_embed = pos_embed[:, 1:, :] pos_embed = pos_embed.transpose([0, 2, 1]) pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) pos_embed = F.interpolate( pos_embed, new_hw, mode='bicubic', align_corners=False) pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) return pos_embed def get_2d_sincos_position_embedding(self, h, w, temperature=10000.): grid_y, grid_x = paddle.meshgrid( paddle.arange( h, dtype=paddle.float32), paddle.arange( w, dtype=paddle.float32)) assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = self.embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = (1. / (temperature**omega)).unsqueeze(0) out_x = grid_x.reshape([-1, 1]).matmul(omega) out_y = grid_y.reshape([-1, 1]).matmul(omega) pos_emb = paddle.concat( [ paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x), paddle.cos(out_x) ], axis=1) return pos_emb.reshape([1, h, w, self.embed_dim]) def forward(self, inputs): x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1]) B, Hp, Wp, _ = x.shape if self.use_abs_pos: x = x + self.get_2d_sincos_position_embedding(Hp, Wp) if self.use_abs_pos_emb: x = x + self.resize_pos_embed(self.pos_embed, (self.pos_h, self.pos_w), (Hp, Wp)) feats = [] for idx, blk in enumerate(self.blocks): if self.use_checkpoint and self.training: x = paddle.distributed.fleet.utils.recompute( blk, x, **{"preserve_rng_state": True}) else: x = blk(x) if idx in self.out_indices: feats.append(self.norm(x.transpose([0, 3, 1, 2]))) if self.with_fpn: fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] for i in range(len(feats)): feats[i] = fpns[i](feats[i]) return feats @property def num_layers(self): return len(self.blocks) @property def no_weight_decay(self): return {'pos_embed', 'cls_token'} @property def out_shape(self): return [ ShapeSpec( channels=c, stride=s) for c, s in zip(self.out_channels, self.out_strides) ] class LayerNorm(nn.Layer): """ A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the channel dimension for inputs that have shape (batch_size, channels, height, width). Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid. In ViT, we use the nn.LayerNorm """ def __init__(self, normalized_shape, eps=1e-6): super().__init__() self.weight = self.create_parameter([normalized_shape]) self.bias = self.create_parameter([normalized_shape]) self.eps = eps self.normalized_shape = (normalized_shape, ) def forward(self, x): u = x.mean(1, keepdim=True) s = (x - u).pow(2).mean(1, keepdim=True) x = (x - u) / paddle.sqrt(s + self.eps) x = self.weight[:, None, None] * x + self.bias[:, None, None] return x @register @serializable class SimpleFeaturePyramid(nn.Layer): def __init__(self, in_channels, out_channels, spatial_scales, num_levels=4, use_bias=False): """ Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config out_channel (int): output channel of each level. 
spatial_scales (list[float]): list of scaling factors to upsample or downsample the input features for creating pyramid features which can be derived from the output shape of backbone by from_config num_levels (int): number of levels of output features. use_bias (bool): whether use bias or not. """ super(SimpleFeaturePyramid, self).__init__() self.in_channels = in_channels[0] self.out_channels = out_channels self.num_levels = num_levels self.stages = [] dim = self.in_channels if num_levels == 4: scale_factors = [2.0, 1.0, 0.5] elif num_levels == 5: scale_factors = [4.0, 2.0, 1.0, 0.5] else: raise NotImplementedError( f"num_levels={num_levels} is not supported yet.") dim = in_channels[0] for idx, scale in enumerate(scale_factors): out_dim = dim if scale == 4.0: layers = [ nn.Conv2DTranspose( dim, dim // 2, kernel_size=2, stride=2), nn.LayerNorm(dim // 2), nn.GELU(), nn.Conv2DTranspose( dim // 2, dim // 4, kernel_size=2, stride=2), ] out_dim = dim // 4 elif scale == 2.0: layers = [ nn.Conv2DTranspose( dim, dim // 2, kernel_size=2, stride=2) ] out_dim = dim // 2 elif scale == 1.0: layers = [] elif scale == 0.5: layers = [nn.MaxPool2D(kernel_size=2, stride=2)] layers.extend([ nn.Conv2D( out_dim, out_channels, kernel_size=1, bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D( out_channels, out_channels, kernel_size=3, padding=1, bias_attr=use_bias, ), LayerNorm(out_channels) ]) layers = nn.Sequential(*layers) stage = -int(math.log2(spatial_scales[0] * scale_factors[idx])) self.add_sublayer(f"simfp_{stage}", layers) self.stages.append(layers) # top block output feature maps. self.top_block = nn.Sequential( nn.MaxPool2D( kernel_size=1, stride=2, padding=0)) @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'spatial_scales': [1.0 / i.stride for i in input_shape], } @property def out_shape(self): return [ ShapeSpec(channels=self.out_channels) for _ in range(self.num_levels) ] def forward(self, feats): """ Args: x: Tensor of shape (N,C,H,W). """ features = feats[0] results = [] for stage in self.stages: results.append(stage(features)) top_block_in_feature = results[-1] results.append(self.top_block(top_block_in_feature)) assert self.num_levels == len(results) return results ================================================ FILE: ppdet/modeling/backbones/vitpose.py ================================================ # copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
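# A short sketch of what the channels-first LayerNorm defined in vit_mae.py
# above computes: it normalizes each spatial position over the channel axis of
# an NCHW tensor, matching F.layer_norm with channels moved last. The tensor
# sizes below are illustrative assumptions.
def _channels_first_layernorm_sketch():
    import paddle
    import paddle.nn.functional as F
    x = paddle.randn([2, 16, 4, 4])  # N, C, H, W
    u = x.mean(1, keepdim=True)
    s = (x - u).pow(2).mean(1, keepdim=True)
    y = (x - u) / paddle.sqrt(s + 1e-6)  # per-position normalization over C
    ref = F.layer_norm(x.transpose([0, 2, 3, 1]), [16], epsilon=1e-6)
    ref = ref.transpose([0, 3, 1, 2])
    # y and ref agree up to the learned affine weight/bias (identity here)
    return y, ref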
# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py # reference: https://arxiv.org/abs/2010.11929 from collections.abc import Callable import numpy as np import paddle import paddle.nn as nn from paddle.nn.initializer import TruncatedNormal, Constant, Normal from ppdet.core.workspace import register, serializable trunc_normal_ = TruncatedNormal(std=.02) def to_2tuple(x): if isinstance(x, (list, tuple)): return x return tuple([x] * 2) def drop_path(x, drop_prob=0., training=False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... """ if drop_prob == 0. or not training: return x keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype) shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor return output class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) class Identity(nn.Layer): def __init__(self): super(Identity, self).__init__() def forward(self, input): return input class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.fc2(x) x = self.drop(x) return x class Attention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): N, C = x.shape[1:] qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) x = self.proj(x) x = self.proj_drop(x) return x class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() if isinstance(norm_layer, str): self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm1 = norm_layer(dim) else: raise TypeError( "The norm_layer must be str or paddle.nn.layer.Layer class") self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        elif isinstance(norm_layer, Callable):
            self.norm2 = norm_layer(dim)
        else:
            raise TypeError(
                "The norm_layer must be str or paddle.nn.layer.Layer class")
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding
    """

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 ratio=1):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (
            img_size[0] // patch_size[0]) * (ratio**2)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=(patch_size[0] // ratio),
            padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1)))

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        return x


@register
@serializable
class ViT(nn.Layer):
    """ Vision Transformer with support for patch input.

    This module differs from ppdet's VisionTransformer
    (ppdet/modeling/backbones/vision_transformer.py) in four ways:
    1. PatchEmbed.proj sets padding=(4 + 2 * (ratio // 2 - 1),
       4 + 2 * (ratio // 2 - 1)); VisionTransformer does not.
    2. The Attention module uses a standard qkv projection, while
       VisionTransformer provides more options.
    3. The Mlp module applies Dropout once, while VisionTransformer
       applies it twice.
    4. VisionTransformer provides FPN layers; this module does not.
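
    A minimal usage sketch (the sizes below are illustrative assumptions,
    not required settings):

        import paddle
        vit = ViT(img_size=(256, 192), patch_size=16, embed_dim=768,
                  depth=12, num_heads=12)
        feat = vit.forward_features(paddle.randn([1, 3, 256, 192]))
        # feat has shape [B, embed_dim, Hp, Wp] = [1, 768, 16, 12]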
""" def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer='nn.LayerNorm', epsilon=1e-5, ratio=1, pretrained=None, **kwargs): super().__init__() self.pretrained = pretrained self.num_features = self.embed_dim = embed_dim self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) num_patches = self.patch_embed.num_patches self.pos_embed = self.create_parameter( shape=(1, num_patches + 1, embed_dim), default_initializer=trunc_normal_) self.add_parameter("pos_embed", self.pos_embed) dpr = np.linspace(0, drop_path_rate, depth, dtype='float32') self.blocks = nn.LayerList([ Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, epsilon=epsilon) for i in range(depth) ]) self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon) trunc_normal_(self.pos_embed) self._init_weights() def _init_weights(self): pretrained = self.pretrained if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained load_state_dict = paddle.load(path) self.set_state_dict(load_state_dict) print("Load load_state_dict:", path) def forward_features(self, x): B = x.shape[0] x = self.patch_embed(x) B, D, Hp, Wp = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] for blk in self.blocks: x = blk(x) x = self.last_norm(x) xp = paddle.reshape( paddle.transpose( x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp]) return xp ================================================ FILE: ppdet/modeling/bbox_utils.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import paddle import numpy as np def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]): """Encode bboxes to deltas. """ src_w = src_boxes[:, 2] - src_boxes[:, 0] src_h = src_boxes[:, 3] - src_boxes[:, 1] src_ctr_x = src_boxes[:, 0] + 0.5 * src_w src_ctr_y = src_boxes[:, 1] + 0.5 * src_h tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h wx, wy, ww, wh = weights dx = wx * (tgt_ctr_x - src_ctr_x) / src_w dy = wy * (tgt_ctr_y - src_ctr_y) / src_h dw = ww * paddle.log(tgt_w / src_w) dh = wh * paddle.log(tgt_h / src_h) deltas = paddle.stack((dx, dy, dw, dh), axis=1) return deltas def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. Note: return tensor shape [n,1,4] If you want to add a reshape, please add after the calling code instead of here. 
""" clip_scale = math.log(1000.0 / 16) widths = boxes[:, 2] - boxes[:, 0] heights = boxes[:, 3] - boxes[:, 1] ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights wx, wy, ww, wh = weights dx = deltas[:, 0::4] / wx dy = deltas[:, 1::4] / wy dw = deltas[:, 2::4] / ww dh = deltas[:, 3::4] / wh # Prevent sending too large values into paddle.exp() dw = paddle.clip(dw, max=clip_scale) dh = paddle.clip(dh, max=clip_scale) pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) pred_w = paddle.exp(dw) * widths.unsqueeze(1) pred_h = paddle.exp(dh) * heights.unsqueeze(1) pred_boxes = [] pred_boxes.append(pred_ctr_x - 0.5 * pred_w) pred_boxes.append(pred_ctr_y - 0.5 * pred_h) pred_boxes.append(pred_ctr_x + 0.5 * pred_w) pred_boxes.append(pred_ctr_y + 0.5 * pred_h) pred_boxes = paddle.stack(pred_boxes, axis=-1) if max_shape is not None: pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( min=0, max=max_shape[1]) pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( min=0, max=max_shape[0]) return pred_boxes def bbox2delta_v2(src_boxes, tgt_boxes, delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[1.0, 1.0, 1.0, 1.0]): """Encode bboxes to deltas. Modified from bbox2delta() which just use weight parameters to multiply deltas. """ src_w = src_boxes[:, 2] - src_boxes[:, 0] src_h = src_boxes[:, 3] - src_boxes[:, 1] src_ctr_x = src_boxes[:, 0] + 0.5 * src_w src_ctr_y = src_boxes[:, 1] + 0.5 * src_h tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h dx = (tgt_ctr_x - src_ctr_x) / src_w dy = (tgt_ctr_y - src_ctr_y) / src_h dw = paddle.log(tgt_w / src_w) dh = paddle.log(tgt_h / src_h) deltas = paddle.stack((dx, dy, dw, dh), axis=1) deltas = ( deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std) return deltas def delta2bbox_v2(deltas, boxes, delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[1.0, 1.0, 1.0, 1.0], max_shape=None, ctr_clip=32.0): """Decode deltas to bboxes. Modified from delta2bbox() which just use weight parameters to be divided by deltas. Used in YOLOFHead. Note: return tensor shape [n,1,4] If you want to add a reshape, please add after the calling code instead of here. 
""" clip_scale = math.log(1000.0 / 16) widths = boxes[:, 2] - boxes[:, 0] heights = boxes[:, 3] - boxes[:, 1] ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean) dx = deltas[:, 0::4] dy = deltas[:, 1::4] dw = deltas[:, 2::4] dh = deltas[:, 3::4] # Prevent sending too large values into paddle.exp() dx = dx * widths.unsqueeze(1) dy = dy * heights.unsqueeze(1) if ctr_clip is not None: dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip) dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip) dw = paddle.clip(dw, max=clip_scale) dh = paddle.clip(dh, max=clip_scale) else: dw = dw.clip(min=-clip_scale, max=clip_scale) dh = dh.clip(min=-clip_scale, max=clip_scale) pred_ctr_x = dx + ctr_x.unsqueeze(1) pred_ctr_y = dy + ctr_y.unsqueeze(1) pred_w = paddle.exp(dw) * widths.unsqueeze(1) pred_h = paddle.exp(dh) * heights.unsqueeze(1) pred_boxes = [] pred_boxes.append(pred_ctr_x - 0.5 * pred_w) pred_boxes.append(pred_ctr_y - 0.5 * pred_h) pred_boxes.append(pred_ctr_x + 0.5 * pred_w) pred_boxes.append(pred_ctr_y + 0.5 * pred_h) pred_boxes = paddle.stack(pred_boxes, axis=-1) if max_shape is not None: pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( min=0, max=max_shape[1]) pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( min=0, max=max_shape[0]) return pred_boxes def expand_bbox(bboxes, scale): w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 w_half *= scale h_half *= scale bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) bboxes_exp[:, 0] = x_c - w_half bboxes_exp[:, 2] = x_c + w_half bboxes_exp[:, 1] = y_c - h_half bboxes_exp[:, 3] = y_c + h_half return bboxes_exp def clip_bbox(boxes, im_shape): h, w = im_shape[0], im_shape[1] x1 = boxes[:, 0].clip(0, w) y1 = boxes[:, 1].clip(0, h) x2 = boxes[:, 2].clip(0, w) y2 = boxes[:, 3].clip(0, h) return paddle.stack([x1, y1, x2, y2], axis=1) def nonempty_bbox(boxes, min_size=0, return_mask=False): w = boxes[:, 2] - boxes[:, 0] h = boxes[:, 3] - boxes[:, 1] mask = paddle.logical_and(h > min_size, w > min_size) if return_mask: return mask keep = paddle.nonzero(mask).flatten() return keep def bbox_area(boxes): return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) def bbox_overlaps(boxes1, boxes2): """ Calculate overlaps between boxes1 and boxes2 Args: boxes1 (Tensor): boxes with shape [M, 4] boxes2 (Tensor): boxes with shape [N, 4] Return: overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] """ M = boxes1.shape[0] N = boxes2.shape[0] if M * N == 0: return paddle.zeros([M, N], dtype='float32') area1 = bbox_area(boxes1) area2 = bbox_area(boxes2) xy_max = paddle.minimum( paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) xy_min = paddle.maximum( paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) width_height = xy_max - xy_min width_height = width_height.clip(min=0) inter = width_height.prod(axis=2) overlaps = paddle.where(inter > 0, inter / (paddle.unsqueeze(area1, 1) + area2 - inter), paddle.zeros_like(inter)) return overlaps def batch_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): """Calculate overlap between two set of bboxes. If ``is_aligned `` is ``False``, then calculate the overlaps between each bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (Tensor): shape (B, m, 4) in format or empty. 
bboxes2 (Tensor): shape (B, n, 4) in format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If ``is_aligned `` is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or "iof" (intersection over foreground). is_aligned (bool, optional): If True, then m and n must be equal. Default False. eps (float, optional): A value added to the denominator for numerical stability. Default 1e-6. Returns: Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) """ assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode) # Either the boxes are empty or the length of boxes's last dimenstion is 4 assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0) assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0) # Batch dim must be the same # Batch dim: (B1, B2, ... Bn) assert bboxes1.shape[:-2] == bboxes2.shape[:-2] batch_shape = bboxes1.shape[:-2] rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 if is_aligned: assert rows == cols if rows * cols == 0: if is_aligned: return paddle.full(batch_shape + (rows, ), 1) else: return paddle.full(batch_shape + (rows, cols), 1) area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) if is_aligned: lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2]) # [B, rows, 2] rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:]) # [B, rows, 2] wh = (rb - lt).clip(min=0) # [B, rows, 2] overlap = wh[:, 0] * wh[:, 1] if mode in ['iou', 'giou']: union = area1 + area2 - overlap else: union = area1 if mode == 'giou': enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2]) enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:]) else: lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]), bboxes2[:, :2]) # [B, rows, cols, 2] rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]), bboxes2[:, 2:]) # [B, rows, cols, 2] wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] overlap = wh[:, :, 0] * wh[:, :, 1] if mode in ['iou', 'giou']: union = area1.reshape([rows,1]) \ + area2.reshape([1,cols]) - overlap else: union = area1[:, None] if mode == 'giou': enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]), bboxes2[:, :2]) enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]), bboxes2[:, 2:]) eps = paddle.to_tensor([eps]) union = paddle.maximum(union, eps) ious = overlap / union if mode in ['iou', 'iof']: return ious # calculate gious enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1] enclose_area = paddle.maximum(enclose_area, eps) gious = ious - (enclose_area - union) / enclose_area return 1 - gious def xywh2xyxy(box): x, y, w, h = box x1 = x - w * 0.5 y1 = y - h * 0.5 x2 = x + w * 0.5 y2 = y + h * 0.5 return [x1, y1, x2, y2] def make_grid(h, w, dtype): yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) return paddle.stack((xv, yv), 2).cast(dtype=dtype) def decode_yolo(box, anchor, downsample_ratio): """decode yolo box Args: box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] anchor (list): anchor with the shape [na, 2] downsample_ratio (int): downsample ratio, default 32 scale (float): scale, default 1. 
Return: box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] """ x, y, w, h = box na, grid_h, grid_w = x.shape[1:4] grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) x1 = (x + grid[:, :, :, :, 0:1]) / grid_w y1 = (y + grid[:, :, :, :, 1:2]) / grid_h anchor = paddle.to_tensor(anchor, dtype=x.dtype) anchor = anchor.reshape((1, na, 1, 1, 2)) w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) return [x1, y1, w1, h1] def batch_iou_similarity(box1, box2, eps=1e-9): """Calculate iou of box1 and box2 in batch Args: box1 (Tensor): box with the shape [N, M1, 4] box2 (Tensor): box with the shape [N, M2, 4] Return: iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] """ box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] x1y1 = paddle.maximum(px1y1, gx1y1) x2y2 = paddle.minimum(px2y2, gx2y2) overlap = (x2y2 - x1y1).clip(0).prod(-1) area1 = (px2y2 - px1y1).clip(0).prod(-1) area2 = (gx2y2 - gx1y1).clip(0).prod(-1) union = area1 + area2 - overlap + eps return overlap / union def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): """calculate the iou of box1 and box2 Args: box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] giou (bool): whether use giou or not, default False diou (bool): whether use diou or not, default False ciou (bool): whether use ciou or not, default False eps (float): epsilon to avoid divide by zero Return: iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] """ px1, py1, px2, py2 = box1 gx1, gy1, gx2, gy2 = box2 x1 = paddle.maximum(px1, gx1) y1 = paddle.maximum(py1, gy1) x2 = paddle.minimum(px2, gx2) y2 = paddle.minimum(py2, gy2) overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) area1 = (px2 - px1) * (py2 - py1) area1 = area1.clip(0) area2 = (gx2 - gx1) * (gy2 - gy1) area2 = area2.clip(0) union = area1 + area2 - overlap + eps iou = overlap / union if giou or ciou or diou: # convex w, h cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) if giou: c_area = cw * ch + eps return iou - (c_area - union) / c_area else: # convex diagonal squared c2 = cw**2 + ch**2 + eps # center distance rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 if diou: return iou - rho2 / c2 else: w1, h1 = px2 - px1, py2 - py1 + eps w2, h2 = gx2 - gx1, gy2 - gy1 + eps delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2) v = (4 / math.pi**2) * paddle.pow(delta, 2) alpha = v / (1 + eps - iou + v) alpha.stop_gradient = True return iou - (rho2 / c2 + v * alpha) else: return iou def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16): """ Calculate the iou of box1 and box2 with numpy. 
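    For example, two boxes [0, 0, 2, 2] and [1, 1, 3, 3] in x1y1x2y2 style
    intersect in a 1x1 square, so their IoU is 1 / (4 + 4 - 1) ~= 0.143.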
Args: box1 (ndarray): [N, 4] box2 (ndarray): [M, 4], usually N != M x1y1x2y2 (bool): whether in x1y1x2y2 stype, default True eps (float): epsilon to avoid divide by zero Return: iou (ndarray): iou of box1 and box2, [N, M] """ N, M = len(box1), len(box2) # usually N != M if x1y1x2y2: b1_x1, b1_y1 = box1[:, 0], box1[:, 1] b1_x2, b1_y2 = box1[:, 2], box1[:, 3] b2_x1, b2_y1 = box2[:, 0], box2[:, 1] b2_x2, b2_y2 = box2[:, 2], box2[:, 3] else: # cxcywh style # Transform from center and width to exact coordinates b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 # get the coordinates of the intersection rectangle inter_rect_x1 = np.zeros((N, M), dtype=np.float32) inter_rect_y1 = np.zeros((N, M), dtype=np.float32) inter_rect_x2 = np.zeros((N, M), dtype=np.float32) inter_rect_y2 = np.zeros((N, M), dtype=np.float32) for i in range(len(box2)): inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i]) inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i]) inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i]) inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i]) # Intersection area inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum( inter_rect_y2 - inter_rect_y1, 0) # Union Area b1_area = np.repeat( ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1) b2_area = np.repeat( ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0) ious = inter_area / (b1_area + b2_area - inter_area + eps) return ious def bbox2distance(points, bbox, max_dis=None, eps=0.1): """Decode bounding box based on distances. Args: points (Tensor): Shape (n, 2), [x, y]. bbox (Tensor): Shape (n, 4), "xyxy" format max_dis (float): Upper bound of the distance. eps (float): a small value to ensure target < max_dis, instead <= Returns: Tensor: Decoded distances. """ left = points[:, 0] - bbox[:, 0] top = points[:, 1] - bbox[:, 1] right = bbox[:, 2] - points[:, 0] bottom = bbox[:, 3] - points[:, 1] if max_dis is not None: left = left.clip(min=0, max=max_dis - eps) top = top.clip(min=0, max=max_dis - eps) right = right.clip(min=0, max=max_dis - eps) bottom = bottom.clip(min=0, max=max_dis - eps) return paddle.stack([left, top, right, bottom], -1) def distance2bbox(points, distance, max_shape=None): """Decode distance prediction to bounding box. Args: points (Tensor): Shape (n, 2), [x, y]. distance (Tensor): Distance from the given point to 4 boundaries (left, top, right, bottom). max_shape (tuple): Shape of the image. Returns: Tensor: Decoded bboxes. """ x1 = points[:, 0] - distance[:, 0] y1 = points[:, 1] - distance[:, 1] x2 = points[:, 0] + distance[:, 2] y2 = points[:, 1] + distance[:, 3] if max_shape is not None: x1 = x1.clip(min=0, max=max_shape[1]) y1 = y1.clip(min=0, max=max_shape[0]) x2 = x2.clip(min=0, max=max_shape[1]) y2 = y2.clip(min=0, max=max_shape[0]) return paddle.stack([x1, y1, x2, y2], -1) def bbox_center(boxes): """Get bbox centers from boxes. Args: boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format. Returns: Tensor: boxes centers with shape (..., 2), "cx, cy" format. """ boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2 boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2 return paddle.stack([boxes_cx, boxes_cy], axis=-1) def batch_distance2bbox(points, distance, max_shapes=None): """Decode distance prediction to bounding box for batch. 
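    For example, a point (5, 5) with "ltrb" distances (1, 2, 3, 4) decodes to
    the box (x1, y1, x2, y2) = (5 - 1, 5 - 2, 5 + 3, 5 + 4) = (4, 3, 8, 9).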
Args: points (Tensor): [B, ..., 2], "xy" format distance (Tensor): [B, ..., 4], "ltrb" format max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image. Returns: Tensor: Decoded bboxes, "x1y1x2y2" format. """ lt, rb = paddle.split(distance, 2, -1) # while tensor add parameters, parameters should be better placed on the second place x1y1 = -lt + points x2y2 = rb + points out_bbox = paddle.concat([x1y1, x2y2], -1) if max_shapes is not None: max_shapes = max_shapes.flip(-1).tile([1, 2]) delta_dim = out_bbox.ndim - max_shapes.ndim for _ in range(delta_dim): max_shapes.unsqueeze_(1) out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes) out_bbox = paddle.where(out_bbox > 0, out_bbox, paddle.zeros_like(out_bbox)) return out_bbox def iou_similarity(box1, box2, eps=1e-10): """Calculate iou of box1 and box2 Args: box1 (Tensor): box with the shape [M1, 4] box2 (Tensor): box with the shape [M2, 4] Return: iou (Tensor): iou between box1 and box2 with the shape [M1, M2] """ box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] x1y1 = paddle.maximum(px1y1, gx1y1) x2y2 = paddle.minimum(px2y2, gx2y2) overlap = (x2y2 - x1y1).clip(0).prod(-1) area1 = (px2y2 - px1y1).clip(0).prod(-1) area2 = (gx2y2 - gx1y1).clip(0).prod(-1) union = area1 + area2 - overlap + eps return overlap / union ================================================ FILE: ppdet/modeling/clrnet_utils.py ================================================ import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.initializer import constant_ from paddle.nn.initializer import KaimingNormal class ConvModule(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bias=False, norm_type='bn', wtih_act=True): super(ConvModule, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn', None] self.with_norm = norm_type is not None self.wtih_act = wtih_act self.conv = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias_attr=bias, weight_attr=KaimingNormal()) if self.with_norm: if norm_type == 'bn': self.bn = nn.BatchNorm2D(out_channels) elif norm_type == 'gn': self.bn = nn.GroupNorm(out_channels, out_channels) if self.wtih_act: self.act = nn.ReLU() def forward(self, inputs): x = self.conv(inputs) if self.with_norm: x = self.bn(x) if self.wtih_act: x = self.act(x) return x def LinearModule(hidden_dim): return nn.LayerList( [nn.Linear( hidden_dim, hidden_dim, bias_attr=True), nn.ReLU()]) class FeatureResize(nn.Layer): def __init__(self, size=(10, 25)): super(FeatureResize, self).__init__() self.size = size def forward(self, x): x = F.interpolate(x, self.size) return x.flatten(2) class ROIGather(nn.Layer): ''' ROIGather module for gather global information Args: in_channels: prior feature channels num_priors: prior numbers we predefined sample_points: the number of sampled points when we extract feature from line fc_hidden_dim: the fc output channel refine_layers: the total number of layers to build refine ''' def __init__(self, in_channels, num_priors, sample_points, fc_hidden_dim, refine_layers, mid_channels=48): super(ROIGather, self).__init__() self.in_channels = in_channels self.num_priors = num_priors self.f_key = ConvModule( in_channels=self.in_channels, out_channels=self.in_channels, kernel_size=1, 
            stride=1,
            padding=0,
            norm_type='bn')
        self.f_query = nn.Sequential(
            nn.Conv1D(
                in_channels=num_priors,
                out_channels=num_priors,
                kernel_size=1,
                stride=1,
                padding=0,
                groups=num_priors),
            nn.ReLU(), )
        self.f_value = nn.Conv2D(
            in_channels=self.in_channels,
            out_channels=self.in_channels,
            kernel_size=1,
            stride=1,
            padding=0)
        self.W = nn.Conv1D(
            in_channels=num_priors,
            out_channels=num_priors,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=num_priors)

        self.resize = FeatureResize()
        constant_(self.W.weight, 0)
        constant_(self.W.bias, 0)

        self.convs = nn.LayerList()
        self.catconv = nn.LayerList()
        for i in range(refine_layers):
            self.convs.append(
                ConvModule(
                    in_channels,
                    mid_channels, (9, 1),
                    padding=(4, 0),
                    bias=False,
                    norm_type='bn'))
            self.catconv.append(
                ConvModule(
                    mid_channels * (i + 1),
                    in_channels, (9, 1),
                    padding=(4, 0),
                    bias=False,
                    norm_type='bn'))

        self.fc = nn.Linear(
            sample_points * fc_hidden_dim, fc_hidden_dim, bias_attr=True)
        self.fc_norm = nn.LayerNorm(fc_hidden_dim)

    def roi_fea(self, x, layer_index):
        feats = []
        for i, feature in enumerate(x):
            feat_trans = self.convs[i](feature)
            feats.append(feat_trans)
        cat_feat = paddle.concat(feats, axis=1)
        cat_feat = self.catconv[layer_index](cat_feat)
        return cat_feat

    def forward(self, roi_features, x, layer_index):
        '''
        Args:
            roi_features: prior feature, shape: (Batch * num_priors, prior_feat_channel, sample_point, 1)
            x: feature map
            layer_index: currently on which layer to refine
        Return:
            roi: prior features with gathered global information, shape: (Batch, num_priors, fc_hidden_dim)
        '''
        roi = self.roi_fea(roi_features, layer_index)
        bs = x.shape[0]
        roi = roi.reshape([bs * self.num_priors, -1])
        roi = self.fc(roi)
        roi = F.relu(self.fc_norm(roi))
        roi = roi.reshape([bs, self.num_priors, -1])
        query = roi

        value = self.resize(self.f_value(x))  # (B, C, N) global feature
        query = self.f_query(
            query)  # (B, N, 1) sample context feature from prior roi
        key = self.f_key(x)
        value = value.transpose(perm=[0, 2, 1])
        key = self.resize(key)  # (B, C, N) global feature
        sim_map = paddle.matmul(query, key)
        sim_map = (self.in_channels**-.5) * sim_map
        sim_map = F.softmax(sim_map, axis=-1)
        context = paddle.matmul(sim_map, value)
        context = self.W(context)

        roi = roi + F.dropout(context, p=0.1, training=self.training)

        return roi


class SegDecoder(nn.Layer):
    '''
    Optional seg decoder.
    '''

    def __init__(self,
                 image_height,
                 image_width,
                 num_class,
                 prior_feat_channels=64,
                 refine_layers=3):
        super().__init__()
        self.dropout = nn.Dropout2D(0.1)
        self.conv = nn.Conv2D(prior_feat_channels * refine_layers, num_class,
                              1)
        self.image_height = image_height
        self.image_width = image_width

    def forward(self, x):
        x = self.dropout(x)
        x = self.conv(x)
        x = F.interpolate(
            x,
            size=[self.image_height, self.image_width],
            mode='bilinear',
            align_corners=False)
        return x


def accuracy(pred, target, topk=1, thresh=None):
    """Calculate accuracy according to the prediction and target.

    Args:
        pred (torch.Tensor): The model prediction, shape (N, num_class)
        target (torch.Tensor): The target of each prediction, shape (N, )
        topk (int | tuple[int], optional): If the predictions in ``topk``
            match the target, the predictions will be regarded as
            correct ones. Defaults to 1.
def accuracy(pred, target, topk=1, thresh=None):
    """Calculate accuracy according to the prediction and target.

    Args:
        pred (paddle.Tensor): The model prediction, shape (N, num_class)
        target (paddle.Tensor): The target of each prediction, shape (N, )
        topk (int | tuple[int], optional): If the predictions in ``topk``
            match the target, the predictions will be regarded as correct
            ones. Defaults to 1.
        thresh (float, optional): If not None, predictions with scores
            under this threshold are considered incorrect. Defaults to None.

    Returns:
        float | tuple[float]: If the input ``topk`` is a single integer,
            the function will return a single float as accuracy. If
            ``topk`` is a tuple containing multiple integers, the
            function will return a tuple containing accuracies of
            each ``topk`` number.
    """
    assert isinstance(topk, (int, tuple))
    if isinstance(topk, int):
        topk = (topk, )
        return_single = True
    else:
        return_single = False

    maxk = max(topk)
    if pred.shape[0] == 0:
        # paddle Tensors have no `new_tensor` method, so build the zero
        # accuracies directly
        accu = [paddle.zeros([1], dtype='float32') for i in range(len(topk))]
        return accu[0] if return_single else accu
    assert pred.ndim == 2 and target.ndim == 1
    assert pred.shape[0] == target.shape[0]
    assert maxk <= pred.shape[1], \
        f'maxk {maxk} exceeds pred dimension {pred.shape[1]}'
    pred_value, pred_label = pred.topk(maxk, axis=1)
    pred_label = pred_label.t()  # transpose to shape (maxk, N)
    correct = pred_label.equal(target.reshape([1, -1]).expand_as(pred_label))
    if thresh is not None:
        # Only prediction values larger than thresh are counted as correct
        correct = correct & (pred_value > thresh).t()
    res = []
    for k in topk:
        correct_k = correct[:k].reshape([-1]).cast("float32").sum(
            0, keepdim=True)
        correct_k = correct_k * (100.0 / pred.shape[0])
        res.append(correct_k)
    return res[0] if return_single else res


class Accuracy(nn.Layer):
    def __init__(self, topk=(1, ), thresh=None):
        """Module to calculate the accuracy.

        Args:
            topk (tuple, optional): The criterion used to calculate the
                accuracy. Defaults to (1,).
            thresh (float, optional): If not None, predictions with scores
                under this threshold are considered incorrect. Defaults to
                None.
        """
        super().__init__()
        self.topk = topk
        self.thresh = thresh

    def forward(self, pred, target):
        """Forward function to calculate accuracy.

        Args:
            pred (paddle.Tensor): Prediction of models.
            target (paddle.Tensor): Target for each prediction.

        Returns:
            tuple[float]: The accuracies under different topk criterions.
        """
        return accuracy(pred, target, self.topk, self.thresh)


================================================
FILE: ppdet/modeling/cls_utils.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def _get_class_default_kwargs(cls, *args, **kwargs):
    """
    Get the default arguments of a class' ``__init__`` as a dict; if args
    or kwargs are specified, they override the corresponding defaults.
    """
    varnames = cls.__init__.__code__.co_varnames
    argcount = cls.__init__.__code__.co_argcount
    keys = varnames[:argcount]
    assert keys[0] == 'self'
    keys = keys[1:]

    values = list(cls.__init__.__defaults__)
    assert len(values) == len(keys)

    if len(args) > 0:
        for i, arg in enumerate(args):
            values[i] = arg

    default_kwargs = dict(zip(keys, values))

    if len(kwargs) > 0:
        for k, v in kwargs.items():
            default_kwargs[k] = v

    return default_kwargs
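# --- Editor's usage sketch (not part of the original file), assuming a
# hypothetical class Foo whose __init__ arguments all have defaults (the
# asserts above require this):
#
#   class Foo:
#       def __init__(self, a=1, b='x'):
#           pass
#
#   _get_class_default_kwargs(Foo)          # {'a': 1, 'b': 'x'}
#   _get_class_default_kwargs(Foo, 5)       # {'a': 5, 'b': 'x'}
#   _get_class_default_kwargs(Foo, b='y')   # {'a': 1, 'b': 'y'}
#
# This is how head configs such as
# `roi_extractor=_get_class_default_kwargs(RoIAlign)` below obtain a
# fully-populated default config dict.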
================================================
FILE: ppdet/modeling/heads/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import bbox_head
from . import mask_head
from . import yolo_head
from . import roi_extractor
from . import ssd_head
from . import fcos_head
from . import solov2_head
from . import ttf_head
from . import cascade_head
from . import face_head
from . import s2anet_head
from . import keypoint_hrhrnet_head
from . import centernet_head
from . import gfl_head
from . import simota_head
from . import pico_head
from . import detr_head
from . import sparsercnn_head
from . import tood_head
from . import retina_head
from . import ppyoloe_head
from . import fcosr_head
from . import ppyoloe_r_head
from . import yolof_head
from . import ppyoloe_contrast_head
from . import centertrack_head
from . import sparse_roi_head
from . import petr_head
from . import vitpose_head
from . import clrnet_head
from . import ppyoloe_ins_head

from .bbox_head import *
from .mask_head import *
from .yolo_head import *
from .roi_extractor import *
from .ssd_head import *
from .fcos_head import *
from .solov2_head import *
from .ttf_head import *
from .cascade_head import *
from .face_head import *
from .s2anet_head import *
from .keypoint_hrhrnet_head import *
from .centernet_head import *
from .gfl_head import *
from .simota_head import *
from .pico_head import *
from .detr_head import *
from .sparsercnn_head import *
from .tood_head import *
from .retina_head import *
from .ppyoloe_head import *
from .fcosr_head import *
from .ppyoloe_r_head import *
from .yolof_head import *
from .ppyoloe_contrast_head import *
from .centertrack_head import *
from .sparse_roi_head import *
from .petr_head import *
from .vitpose_head import *
from .clrnet_head import *
from .ppyoloe_ins_head import PPYOLOEInsHead


================================================
FILE: ppdet/modeling/heads/bbox_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, XavierUniform, KaimingNormal from paddle.regularizer import L2Decay from ppdet.core.workspace import register, create from .roi_extractor import RoIAlign from ..shape_spec import ShapeSpec from ..bbox_utils import bbox2delta from ..cls_utils import _get_class_default_kwargs from ppdet.modeling.layers import ConvNormLayer __all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] @register class TwoFCHead(nn.Layer): """ RCNN bbox head with Two fc layers to extract feature Args: in_channel (int): Input channel which can be derived by from_config out_channel (int): Output channel resolution (int): Resolution of input feature map, default 7 """ def __init__(self, in_channel=256, out_channel=1024, resolution=7): super(TwoFCHead, self).__init__() self.in_channel = in_channel self.out_channel = out_channel fan = in_channel * resolution * resolution self.fc6 = nn.Linear( in_channel * resolution * resolution, out_channel, weight_attr=paddle.ParamAttr( initializer=XavierUniform(fan_out=fan))) self.fc6.skip_quant = True self.fc7 = nn.Linear( out_channel, out_channel, weight_attr=paddle.ParamAttr(initializer=XavierUniform())) self.fc7.skip_quant = True @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat): rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) fc6 = self.fc6(rois_feat) fc6 = F.relu(fc6) fc7 = self.fc7(fc6) fc7 = F.relu(fc7) return fc7 @register class XConvNormHead(nn.Layer): __shared__ = ['norm_type', 'freeze_norm'] """ RCNN bbox head with serveral convolution layers Args: in_channel (int): Input channels which can be derived by from_config num_convs (int): The number of conv layers conv_dim (int): The number of channels for the conv layers out_channel (int): Output channels resolution (int): Resolution of input feature map norm_type (string): Norm type, bn, gn, sync_bn are available, default `gn` freeze_norm (bool): Whether to freeze the norm stage_name (string): Prefix name for conv layer, '' by default """ def __init__(self, in_channel=256, num_convs=4, conv_dim=256, out_channel=1024, resolution=7, norm_type='gn', freeze_norm=False, stage_name=''): super(XConvNormHead, self).__init__() self.in_channel = in_channel self.num_convs = num_convs self.conv_dim = conv_dim self.out_channel = out_channel self.norm_type = norm_type self.freeze_norm = freeze_norm self.bbox_head_convs = [] fan = conv_dim * 3 * 3 initializer = KaimingNormal(fan_in=fan) for i in range(self.num_convs): in_c = in_channel if i == 0 else conv_dim head_conv_name = stage_name + 'bbox_head_conv{}'.format(i) head_conv = self.add_sublayer( head_conv_name, ConvNormLayer( ch_in=in_c, ch_out=conv_dim, filter_size=3, stride=1, norm_type=self.norm_type, freeze_norm=self.freeze_norm, initializer=initializer)) self.bbox_head_convs.append(head_conv) fan = conv_dim * resolution * resolution 
self.fc6 = nn.Linear( conv_dim * resolution * resolution, out_channel, weight_attr=paddle.ParamAttr( initializer=XavierUniform(fan_out=fan)), bias_attr=paddle.ParamAttr( learning_rate=2., regularizer=L2Decay(0.))) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat): for i in range(self.num_convs): rois_feat = F.relu(self.bbox_head_convs[i](rois_feat)) rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) fc6 = F.relu(self.fc6(rois_feat)) return fc6 @register class BBoxHead(nn.Layer): __shared__ = ['num_classes', 'use_cot'] __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot'] """ RCNN bbox head Args: head (nn.Layer): Extract feature in bbox head in_channel (int): Input channel after RoI extractor roi_extractor (object): The module of RoI Extractor bbox_assigner (object): The module of Box Assigner, label and sample the box. with_pool (bool): Whether to use pooling for the RoI feature. num_classes (int): The number of classes bbox_weight (List[float]): The weight to get the decode box cot_classes (int): The number of base classes loss_cot (object): The module of Label-cotuning use_cot(bool): whether to use Label-cotuning """ def __init__(self, head, in_channel, roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_assigner='BboxAssigner', with_pool=False, num_classes=80, bbox_weight=[10., 10., 5., 5.], bbox_loss=None, loss_normalize_pos=False, cot_classes=None, loss_cot='COTLoss', use_cot=False): super(BBoxHead, self).__init__() self.head = head self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.bbox_assigner = bbox_assigner self.with_pool = with_pool self.num_classes = num_classes self.bbox_weight = bbox_weight self.bbox_loss = bbox_loss self.loss_normalize_pos = loss_normalize_pos self.loss_cot = loss_cot self.cot_relation = None self.cot_classes = cot_classes self.use_cot = use_cot if use_cot: self.cot_bbox_score = nn.Linear( in_channel, self.num_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01))) self.bbox_score = nn.Linear( in_channel, self.cot_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01))) self.cot_bbox_score.skip_quant = True else: self.bbox_score = nn.Linear( in_channel, self.num_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01))) self.bbox_score.skip_quant = True self.bbox_delta = nn.Linear( in_channel, 4 * self.num_classes, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.001))) self.bbox_delta.skip_quant = True self.assigned_label = None self.assigned_rois = None def init_cot_head(self, relationship): self.cot_relation = relationship @classmethod def from_config(cls, cfg, input_shape): roi_pooler = cfg['roi_extractor'] assert isinstance(roi_pooler, dict) kwargs = RoIAlign.from_config(cfg, input_shape) roi_pooler.update(kwargs) kwargs = {'input_shape': input_shape} head = create(cfg['head'], **kwargs) return { 'roi_extractor': roi_pooler, 'head': head, 'in_channel': head.out_shape[0].channels } def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False): """ body_feats (list[Tensor]): Feature maps from backbone rois (list[Tensor]): RoIs generated from RPN module rois_num (Tensor): The number of RoIs in each image inputs (dict{Tensor}): The ground-truth 
of image """ if self.training: rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) self.assigned_rois = (rois, rois_num) self.assigned_targets = targets rois_feat = self.roi_extractor(body_feats, rois, rois_num) bbox_feat = self.head(rois_feat) if self.with_pool: feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1) feat = paddle.squeeze(feat, axis=[2, 3]) else: feat = bbox_feat if self.use_cot: scores = self.cot_bbox_score(feat) cot_scores = self.bbox_score(feat) else: scores = self.bbox_score(feat) deltas = self.bbox_delta(feat) if self.training: loss = self.get_loss( scores, deltas, targets, rois, self.bbox_weight, loss_normalize_pos=self.loss_normalize_pos) if self.cot_relation is not None: loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation) loss.update(loss_cot) return loss, bbox_feat else: if cot: pred = self.get_prediction(cot_scores, deltas) else: pred = self.get_prediction(scores, deltas) return pred, self.head def get_loss(self, scores, deltas, targets, rois, bbox_weight, loss_normalize_pos=False): """ scores (Tensor): scores from bbox head outputs deltas (Tensor): deltas from bbox head outputs targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds rois (List[Tensor]): RoIs generated in each batch """ cls_name = 'loss_bbox_cls' reg_name = 'loss_bbox_reg' loss_bbox = {} # TODO: better pass args tgt_labels, tgt_bboxes, tgt_gt_inds = targets # bbox cls tgt_labels = paddle.concat(tgt_labels) if len( tgt_labels) > 1 else tgt_labels[0] valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() if valid_inds.shape[0] == 0: loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') else: tgt_labels = tgt_labels.cast('int64') tgt_labels.stop_gradient = True if not loss_normalize_pos: loss_bbox_cls = F.cross_entropy( input=scores, label=tgt_labels, reduction='mean') else: loss_bbox_cls = F.cross_entropy( input=scores, label=tgt_labels, reduction='none').sum() / (tgt_labels.shape[0] + 1e-7) loss_bbox[cls_name] = loss_bbox_cls # bbox reg cls_agnostic_bbox_reg = deltas.shape[1] == 4 fg_inds = paddle.nonzero( paddle.logical_and(tgt_labels >= 0, tgt_labels < self.num_classes)).flatten() if fg_inds.numel() == 0: # loss_bbox[reg_name] = paddle.zeros([1], dtype='float32') loss_bbox[reg_name] = scores.mean() * 0. + deltas.mean() * 0. 
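# NOTE: `scores.mean() * 0. + deltas.mean() * 0.` is used instead of the
# commented-out fresh zero tensor so that the loss stays connected to the
# head outputs: every parameter still receives a well-defined (zero)
# gradient when an image has no foreground RoIs, which keeps gradient
# synchronization consistent across ranks in distributed training.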
return loss_bbox if cls_agnostic_bbox_reg: reg_delta = paddle.gather(deltas, fg_inds) else: fg_gt_classes = paddle.gather(tgt_labels, fg_inds) reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1) reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1]) reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4) reg_col_inds = reg_col_inds.reshape([-1, 1]) reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1) reg_delta = paddle.gather(deltas, fg_inds) reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4]) rois = paddle.concat(rois) if len(rois) > 1 else rois[0] tgt_bboxes = paddle.concat(tgt_bboxes) if len( tgt_bboxes) > 1 else tgt_bboxes[0] reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight) reg_target = paddle.gather(reg_target, fg_inds) reg_target.stop_gradient = True if self.bbox_loss is not None: reg_delta = self.bbox_transform(reg_delta) reg_target = self.bbox_transform(reg_target) if not loss_normalize_pos: loss_bbox_reg = self.bbox_loss( reg_delta, reg_target).sum() / tgt_labels.shape[0] loss_bbox_reg *= self.num_classes else: loss_bbox_reg = self.bbox_loss( reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7) else: loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum( ) / tgt_labels.shape[0] loss_bbox[reg_name] = loss_bbox_reg return loss_bbox def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]): wx, wy, ww, wh = weights deltas = paddle.reshape(deltas, shape=(0, -1, 4)) dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh dw = paddle.clip(dw, -1.e10, np.log(1000. / 16)) dh = paddle.clip(dh, -1.e10, np.log(1000. / 16)) pred_ctr_x = dx pred_ctr_y = dy pred_w = paddle.exp(dw) pred_h = paddle.exp(dh) x1 = pred_ctr_x - 0.5 * pred_w y1 = pred_ctr_y - 0.5 * pred_h x2 = pred_ctr_x + 0.5 * pred_w y2 = pred_ctr_y + 0.5 * pred_h x1 = paddle.reshape(x1, shape=(-1, )) y1 = paddle.reshape(y1, shape=(-1, )) x2 = paddle.reshape(x2, shape=(-1, )) y2 = paddle.reshape(y2, shape=(-1, )) return paddle.concat([x1, y1, x2, y2]) def get_prediction(self, score, delta): bbox_prob = F.softmax(score) return delta, bbox_prob def get_head(self, ): return self.head def get_assigned_targets(self, ): return self.assigned_targets def get_assigned_rois(self, ): return self.assigned_rois ================================================ FILE: ppdet/modeling/heads/cascade_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
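# --- Editor's illustrative sketch (not part of the original file): the
# cascade head below repeatedly decodes per-stage deltas into boxes via
# `delta2bbox`. A minimal NumPy version of the standard decoding, assuming
# a single box and unit weights (the name `_decode_delta_sketch` is ours):
import numpy as np


def _decode_delta_sketch(proposal, delta, clip=np.log(1000. / 16)):
    """proposal: [x1, y1, x2, y2]; delta: [dx, dy, dw, dh]."""
    pw, ph = proposal[2] - proposal[0], proposal[3] - proposal[1]
    px, py = proposal[0] + 0.5 * pw, proposal[1] + 0.5 * ph
    # the center shift is scaled by the proposal size; width/height changes
    # live in log-space and are clipped for numerical stability
    cx, cy = px + delta[0] * pw, py + delta[1] * ph
    w = pw * np.exp(min(delta[2], clip))
    h = ph * np.exp(min(delta[3], clip))
    return np.array(
        [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h])
# --- end of sketch ---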
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal from ppdet.core.workspace import register from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead from .roi_extractor import RoIAlign from ..shape_spec import ShapeSpec from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox from ..cls_utils import _get_class_default_kwargs __all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] @register class CascadeTwoFCHead(nn.Layer): __shared__ = ['num_cascade_stage'] """ Cascade RCNN bbox head with Two fc layers to extract feature Args: in_channel (int): Input channel which can be derived by from_config out_channel (int): Output channel resolution (int): Resolution of input feature map, default 7 num_cascade_stage (int): The number of cascade stage, default 3 """ def __init__(self, in_channel=256, out_channel=1024, resolution=7, num_cascade_stage=3): super(CascadeTwoFCHead, self).__init__() self.in_channel = in_channel self.out_channel = out_channel self.head_list = [] for stage in range(num_cascade_stage): head_per_stage = self.add_sublayer( str(stage), TwoFCHead(in_channel, out_channel, resolution)) self.head_list.append(head_per_stage) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat, stage=0): out = self.head_list[stage](rois_feat) return out @register class CascadeXConvNormHead(nn.Layer): __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage'] """ Cascade RCNN bbox head with serveral convolution layers Args: in_channel (int): Input channels which can be derived by from_config num_convs (int): The number of conv layers conv_dim (int): The number of channels for the conv layers out_channel (int): Output channels resolution (int): Resolution of input feature map norm_type (string): Norm type, bn, gn, sync_bn are available, default `gn` freeze_norm (bool): Whether to freeze the norm num_cascade_stage (int): The number of cascade stage, default 3 """ def __init__(self, in_channel=256, num_convs=4, conv_dim=256, out_channel=1024, resolution=7, norm_type='gn', freeze_norm=False, num_cascade_stage=3): super(CascadeXConvNormHead, self).__init__() self.in_channel = in_channel self.out_channel = out_channel self.head_list = [] for stage in range(num_cascade_stage): head_per_stage = self.add_sublayer( str(stage), XConvNormHead( in_channel, num_convs, conv_dim, out_channel, resolution, norm_type, freeze_norm, stage_name='stage{}_'.format(stage))) self.head_list.append(head_per_stage) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat, stage=0): out = self.head_list[stage](rois_feat) return out @register class CascadeHead(BBoxHead): __shared__ = ['num_classes', 'num_cascade_stages'] __inject__ = ['bbox_assigner', 'bbox_loss'] """ Cascade RCNN bbox head Args: head (nn.Layer): Extract feature in bbox head in_channel (int): Input channel after RoI extractor roi_extractor (object): The module of RoI Extractor bbox_assigner (object): The module of Box Assigner, label and sample the box. 
num_classes (int): The number of classes bbox_weight (List[List[float]]): The weight to get the decode box and the length of weight is the number of cascade stage num_cascade_stages (int): THe number of stage to refine the box """ def __init__(self, head, in_channel, roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_assigner='BboxAssigner', num_classes=80, bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0], [30.0, 30.0, 15.0, 15.0]], num_cascade_stages=3, bbox_loss=None, reg_class_agnostic=True, stage_loss_weights=None, loss_normalize_pos=False, add_gt_as_proposals=[True, False, False]): nn.Layer.__init__(self, ) self.head = head self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.bbox_assigner = bbox_assigner self.num_classes = num_classes self.bbox_weight = bbox_weight self.num_cascade_stages = num_cascade_stages self.bbox_loss = bbox_loss self.stage_loss_weights = [ 1. / num_cascade_stages for _ in range(num_cascade_stages) ] if stage_loss_weights is None else stage_loss_weights self.add_gt_as_proposals = add_gt_as_proposals assert len( self.stage_loss_weights ) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) do not equal to num_cascade_stages({num_cascade_stages})' self.reg_class_agnostic = reg_class_agnostic num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes self.loss_normalize_pos = loss_normalize_pos self.bbox_score_list = [] self.bbox_delta_list = [] for i in range(num_cascade_stages): score_name = 'bbox_score_stage{}'.format(i) delta_name = 'bbox_delta_stage{}'.format(i) bbox_score = self.add_sublayer( score_name, nn.Linear( in_channel, self.num_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01)))) bbox_delta = self.add_sublayer( delta_name, nn.Linear( in_channel, num_bbox_delta, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.001)))) self.bbox_score_list.append(bbox_score) self.bbox_delta_list.append(bbox_delta) self.assigned_label = None self.assigned_rois = None def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None): """ body_feats (list[Tensor]): Feature maps from backbone rois (Tensor): RoIs generated from RPN module rois_num (Tensor): The number of RoIs in each image inputs (dict{Tensor}): The ground-truth of image """ targets = [] if self.training: rois, rois_num, targets = self.bbox_assigner( rois, rois_num, inputs, add_gt_as_proposals=self.add_gt_as_proposals[0]) targets_list = [targets] self.assigned_rois = (rois, rois_num) self.assigned_targets = targets pred_bbox = None head_out_list = [] for i in range(self.num_cascade_stages): if i > 0: rois, rois_num = self._get_rois_from_boxes(pred_bbox, inputs['im_shape']) if self.training: rois, rois_num, targets = self.bbox_assigner( rois, rois_num, inputs, i, is_cascade=True, add_gt_as_proposals=self.add_gt_as_proposals[i]) targets_list.append(targets) rois_feat = self.roi_extractor(body_feats, rois, rois_num) bbox_feat = self.head(rois_feat, i) scores = self.bbox_score_list[i](bbox_feat) deltas = self.bbox_delta_list[i](bbox_feat) # TODO (lyuwenyu) Is it correct for only one class ? 
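# When class-specific regression is used (reg_class_agnostic=False), each
# RoI predicts num_classes sets of 4 deltas; the block below keeps only
# the deltas of the highest-scoring class. Training indexes directly,
# while inference selects through a one-hot mask so the graph stays
# export-friendly.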
if not self.reg_class_agnostic and i < self.num_cascade_stages - 1: deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4]) labels = scores[:, :-1].argmax(axis=-1) if self.training: deltas = deltas[paddle.arange(deltas.shape[0]), labels] else: deltas = deltas[((deltas + 10000) * F.one_hot( labels, num_classes=self.num_classes).unsqueeze(-1) != 0 ).nonzero(as_tuple=True)].reshape( [deltas.shape[0], 4]) head_out_list.append([scores, deltas, rois]) pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i]) if self.training: loss = {} for stage, value in enumerate(zip(head_out_list, targets_list)): (scores, deltas, rois), targets = value loss_stage = self.get_loss( scores, deltas, targets, rois, self.bbox_weight[stage], loss_normalize_pos=self.loss_normalize_pos) for k, v in loss_stage.items(): loss[k + "_stage{}".format( stage)] = v * self.stage_loss_weights[stage] return loss, bbox_feat else: scores, deltas, self.refined_rois = self.get_prediction( head_out_list) return (deltas, scores), self.head def _get_rois_from_boxes(self, boxes, im_shape): rois = [] for i, boxes_per_image in enumerate(boxes): clip_box = clip_bbox(boxes_per_image, im_shape[i]) if self.training: keep = nonempty_bbox(clip_box) if keep.shape[0] == 0: keep = paddle.zeros([1], dtype='int32') clip_box = paddle.gather(clip_box, keep) rois.append(clip_box) rois_num = paddle.concat([paddle.shape(r)[0:1] for r in rois]) return rois, rois_num def _get_pred_bbox(self, deltas, proposals, weights): pred_proposals = paddle.concat(proposals) if len( proposals) > 1 else proposals[0] pred_bbox = delta2bbox(deltas, pred_proposals, weights) pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]]) num_prop = [] for p in proposals: num_prop.append(p.shape[0]) # NOTE(dev): num_prob will be tagged as LoDTensorArray because it # depends on batch_size under @to_static. However the argument # num_or_sections in paddle.split does not support LoDTensorArray, # so we use [-1] to replace it if num_prop is not list. The modification # This ensures the correctness of both dynamic and static graphs. if not isinstance(num_prop, list): num_prop = [-1] return pred_bbox.split(num_prop) def get_prediction(self, head_out_list): """ head_out_list(List[Tensor]): scores, deltas, rois """ pred_list = [] scores_list = [F.softmax(head[0]) for head in head_out_list] scores = paddle.add_n(scores_list) / self.num_cascade_stages # Get deltas and rois from the last stage _, deltas, rois = head_out_list[-1] return scores, deltas, rois def get_refined_rois(self, ): return self.refined_rois ================================================ FILE: ppdet/modeling/heads/centernet_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
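# --- Editor's illustrative sketch (not part of the original file): the
# losses below gather per-object predictions out of a dense [B, H*W, C]
# head output with `paddle.gather_nd`, using (batch_id, spatial_index)
# pairs. A self-contained version of that indexing pattern (the helper
# name is ours):
import paddle


def _gather_positive_sketch(flat, index):
    """flat: [B, H*W, C]; index: [B, K, 1] int64 -> [B, K, C]."""
    bs, k = index.shape[0], index.shape[1]
    batch_ids = paddle.arange(
        bs, dtype='int64').reshape([bs, 1, 1]).tile([1, k, 1])
    nd_index = paddle.concat([batch_ids, index], axis=2)  # [B, K, 2]
    return paddle.gather_nd(flat, nd_index)
# CenterNetHead.get_loss builds the same [B, K, 2] index with a python
# loop over paddle.full; the arange form above is an equivalent,
# vectorized illustration.
# --- end of sketch ---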
import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Constant, Uniform from ppdet.core.workspace import register from ppdet.modeling.losses import CTFocalLoss, GIoULoss class ConvLayer(nn.Layer): def __init__(self, ch_in, ch_out, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): super(ConvLayer, self).__init__() bias_attr = False fan_in = ch_in * kernel_size**2 bound = 1 / math.sqrt(fan_in) param_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) if bias: bias_attr = paddle.ParamAttr(initializer=Constant(0.)) self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, weight_attr=param_attr, bias_attr=bias_attr) def forward(self, inputs): out = self.conv(inputs) return out @register class CenterNetHead(nn.Layer): """ Args: in_channels (int): the channel number of input to CenterNetHead. num_classes (int): the number of classes, 80 (COCO dataset) by default. head_planes (int): the channel number in all head, 256 by default. prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack regress_ltrb (bool): whether to regress left/top/right/bottom or width/height for a box, True by default. size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'. loss_weight (dict): the weight of each loss. add_iou (bool): whether to add iou branch, False by default. """ __shared__ = ['num_classes'] def __init__(self, in_channels, num_classes=80, head_planes=256, prior_bias=-2.19, regress_ltrb=True, size_loss='L1', loss_weight={ 'heatmap': 1.0, 'size': 0.1, 'offset': 1.0, 'iou': 0.0, }, add_iou=False): super(CenterNetHead, self).__init__() self.regress_ltrb = regress_ltrb self.loss_weight = loss_weight self.add_iou = add_iou # heatmap head self.heatmap = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, num_classes, kernel_size=1, stride=1, padding=0, bias=True)) with paddle.no_grad(): self.heatmap[2].conv.bias[:] = prior_bias # size(ltrb or wh) head self.size = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 4 if regress_ltrb else 2, kernel_size=1, stride=1, padding=0, bias=True)) self.size_loss = size_loss # offset head self.offset = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) # iou head (optinal) if self.add_iou and 'iou' in self.loss_weight: self.iou = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 4 if regress_ltrb else 2, kernel_size=1, stride=1, padding=0, bias=True)) @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels} def forward(self, feat, inputs): heatmap = F.sigmoid(self.heatmap(feat)) size = self.size(feat) offset = self.offset(feat) head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset} if self.add_iou and 'iou' in self.loss_weight: iou = self.iou(feat) head_outs.update({'iou': iou}) if self.training: losses = self.get_loss(inputs, self.loss_weight, head_outs) return losses else: return head_outs def get_loss(self, inputs, weights, head_outs): # 1.heatmap(hm) head loss: CTFocalLoss 
heatmap = head_outs['heatmap'] heatmap_target = inputs['heatmap'] heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4) ctfocal_loss = CTFocalLoss() heatmap_loss = ctfocal_loss(heatmap, heatmap_target) # 2.size(wh) head loss: L1 loss or GIoU loss size = head_outs['size'] index = inputs['index'] mask = inputs['index_mask'] size = paddle.transpose(size, perm=[0, 2, 3, 1]) size_n, _, _, size_c = size.shape size = paddle.reshape(size, shape=[size_n, -1, size_c]) index = paddle.unsqueeze(index, 2) batch_inds = list() for i in range(size_n): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) pos_size = paddle.gather_nd(size, index=index) mask = paddle.unsqueeze(mask, axis=2) size_mask = paddle.expand_as(mask, pos_size) size_mask = paddle.cast(size_mask, dtype=pos_size.dtype) pos_num = size_mask.sum() size_mask.stop_gradient = True if self.size_loss == 'L1': if self.regress_ltrb: size_target = inputs['size'] # shape: [bs, max_per_img, 4] else: if inputs['size'].shape[-1] == 2: # inputs['size'] is wh, and regress as wh # shape: [bs, max_per_img, 2] size_target = inputs['size'] else: # inputs['size'] is ltrb, but regress as wh # shape: [bs, max_per_img, 4] size_target = inputs['size'][:, :, 0:2] + inputs[ 'size'][:, :, 2:] size_target.stop_gradient = True size_loss = F.l1_loss( pos_size * size_mask, size_target * size_mask, reduction='sum') size_loss = size_loss / (pos_num + 1e-4) elif self.size_loss == 'giou': size_target = inputs['bbox_xys'] size_target.stop_gradient = True centers_x = (size_target[:, :, 0:1] + size_target[:, :, 2:3]) / 2.0 centers_y = (size_target[:, :, 1:2] + size_target[:, :, 3:4]) / 2.0 x1 = centers_x - pos_size[:, :, 0:1] y1 = centers_y - pos_size[:, :, 1:2] x2 = centers_x + pos_size[:, :, 2:3] y2 = centers_y + pos_size[:, :, 3:4] pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) giou_loss = GIoULoss(reduction='sum') size_loss = giou_loss( pred_boxes * size_mask, size_target * size_mask, iou_weight=size_mask, loc_reweight=None) size_loss = size_loss / (pos_num + 1e-4) # 3.offset(reg) head loss: L1 loss offset = head_outs['offset'] offset_target = inputs['offset'] offset = paddle.transpose(offset, perm=[0, 2, 3, 1]) offset_n, _, _, offset_c = offset.shape offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c]) pos_offset = paddle.gather_nd(offset, index=index) offset_mask = paddle.expand_as(mask, pos_offset) offset_mask = paddle.cast(offset_mask, dtype=pos_offset.dtype) pos_num = offset_mask.sum() offset_mask.stop_gradient = True offset_target.stop_gradient = True offset_loss = F.l1_loss( pos_offset * offset_mask, offset_target * offset_mask, reduction='sum') offset_loss = offset_loss / (pos_num + 1e-4) # 4.iou head loss: GIoU loss (optinal) if self.add_iou and 'iou' in self.loss_weight: iou = head_outs['iou'] iou = paddle.transpose(iou, perm=[0, 2, 3, 1]) iou_n, _, _, iou_c = iou.shape iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c]) pos_iou = paddle.gather_nd(iou, index=index) iou_mask = paddle.expand_as(mask, pos_iou) iou_mask = paddle.cast(iou_mask, dtype=pos_iou.dtype) pos_num = iou_mask.sum() iou_mask.stop_gradient = True gt_bbox_xys = inputs['bbox_xys'] gt_bbox_xys.stop_gradient = True centers_x = (gt_bbox_xys[:, :, 0:1] + gt_bbox_xys[:, :, 2:3]) / 2.0 centers_y = (gt_bbox_xys[:, :, 1:2] + gt_bbox_xys[:, :, 3:4]) / 2.0 x1 = centers_x - pos_size[:, :, 0:1] y1 = centers_y - pos_size[:, :, 1:2] x2 = 
centers_x + pos_size[:, :, 2:3] y2 = centers_y + pos_size[:, :, 3:4] pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) giou_loss = GIoULoss(reduction='sum') iou_loss = giou_loss( pred_boxes * iou_mask, gt_bbox_xys * iou_mask, iou_weight=iou_mask, loc_reweight=None) iou_loss = iou_loss / (pos_num + 1e-4) losses = { 'heatmap_loss': heatmap_loss, 'size_loss': size_loss, 'offset_loss': offset_loss, } det_loss = weights['heatmap'] * heatmap_loss + weights[ 'size'] * size_loss + weights['offset'] * offset_loss if self.add_iou and 'iou' in self.loss_weight: losses.update({'iou_loss': iou_loss}) det_loss += weights['iou'] * iou_loss losses.update({'det_loss': det_loss}) return losses ================================================ FILE: ppdet/modeling/heads/centertrack_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from .centernet_head import ConvLayer from ..keypoint_utils import get_affine_transform __all__ = ['CenterTrackHead'] @register class CenterTrackHead(nn.Layer): """ Args: in_channels (int): the channel number of input to CenterNetHead. num_classes (int): the number of classes, 1 (MOT17 dataset) by default. head_planes (int): the channel number in all head, 256 by default. task (str): the type of task for regression, 'tracking' by default. loss_weight (dict): the weight of each loss. add_ltrb_amodal (bool): whether to add ltrb_amodal branch, False by default. 
""" __shared__ = ['num_classes'] def __init__(self, in_channels, num_classes=1, head_planes=256, task='tracking', loss_weight={ 'tracking': 1.0, 'ltrb_amodal': 0.1, }, add_ltrb_amodal=True): super(CenterTrackHead, self).__init__() self.task = task self.loss_weight = loss_weight self.add_ltrb_amodal = add_ltrb_amodal # tracking head self.tracking = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) # ltrb_amodal head if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: self.ltrb_amodal = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 4, kernel_size=1, stride=1, padding=0, bias=True)) # TODO: add more tasks @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels} def forward(self, feat, inputs, bboxes=None, bbox_inds=None, topk_clses=None, topk_ys=None, topk_xs=None): tracking = self.tracking(feat) head_outs = {'tracking': tracking} if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: ltrb_amodal = self.ltrb_amodal(feat) head_outs.update({'ltrb_amodal': ltrb_amodal}) if self.training: losses = self.get_loss(inputs, self.loss_weight, head_outs) return losses else: ret = self.generic_decode(head_outs, bboxes, bbox_inds, topk_ys, topk_xs) return ret def get_loss(self, inputs, weights, head_outs): index = inputs['index'].unsqueeze(2) mask = inputs['index_mask'].unsqueeze(2) batch_inds = list() for i in range(head_outs['tracking'].shape[0]): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) # 1.tracking head loss: L1 loss tracking = head_outs['tracking'].transpose([0, 2, 3, 1]) tracking_target = inputs['tracking'] bs, _, _, c = tracking.shape tracking = tracking.reshape([bs, -1, c]) pos_tracking = paddle.gather_nd(tracking, index=index) tracking_mask = paddle.cast( paddle.expand_as(mask, pos_tracking), dtype=pos_tracking.dtype) pos_num = tracking_mask.sum() tracking_mask.stop_gradient = True tracking_target.stop_gradient = True tracking_loss = F.l1_loss( pos_tracking * tracking_mask, tracking_target * tracking_mask, reduction='sum') tracking_loss = tracking_loss / (pos_num + 1e-4) # 2.ltrb_amodal head loss(optinal): L1 loss if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: ltrb_amodal = head_outs['ltrb_amodal'].transpose([0, 2, 3, 1]) ltrb_amodal_target = inputs['ltrb_amodal'] bs, _, _, c = ltrb_amodal.shape ltrb_amodal = ltrb_amodal.reshape([bs, -1, c]) pos_ltrb_amodal = paddle.gather_nd(ltrb_amodal, index=index) ltrb_amodal_mask = paddle.cast( paddle.expand_as(mask, pos_ltrb_amodal), dtype=pos_ltrb_amodal.dtype) pos_num = ltrb_amodal_mask.sum() ltrb_amodal_mask.stop_gradient = True ltrb_amodal_target.stop_gradient = True ltrb_amodal_loss = F.l1_loss( pos_ltrb_amodal * ltrb_amodal_mask, ltrb_amodal_target * ltrb_amodal_mask, reduction='sum') ltrb_amodal_loss = ltrb_amodal_loss / (pos_num + 1e-4) losses = {'tracking_loss': tracking_loss, } plugin_loss = weights['tracking'] * tracking_loss if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: losses.update({'ltrb_amodal_loss': ltrb_amodal_loss}) plugin_loss += weights['ltrb_amodal'] * ltrb_amodal_loss losses.update({'plugin_loss': 
plugin_loss}) return losses def generic_decode(self, head_outs, bboxes, bbox_inds, topk_ys, topk_xs): topk_ys = paddle.floor(topk_ys) # note: More accurate topk_xs = paddle.floor(topk_xs) cts = paddle.concat([topk_xs, topk_ys], 1) ret = {'bboxes': bboxes, 'cts': cts} regression_heads = ['tracking'] # todo: add more tasks for head in regression_heads: if head in head_outs: ret[head] = _tranpose_and_gather_feat(head_outs[head], bbox_inds) if 'ltrb_amodal' in head_outs: ltrb_amodal = head_outs['ltrb_amodal'] ltrb_amodal = _tranpose_and_gather_feat(ltrb_amodal, bbox_inds) bboxes_amodal = paddle.concat( [ topk_xs * 1.0 + ltrb_amodal[..., 0:1], topk_ys * 1.0 + ltrb_amodal[..., 1:2], topk_xs * 1.0 + ltrb_amodal[..., 2:3], topk_ys * 1.0 + ltrb_amodal[..., 3:4] ], axis=1) ret['bboxes'] = paddle.concat([bboxes[:, 0:2], bboxes_amodal], 1) # cls_id, score, x0, y0, x1, y1 return ret def centertrack_post_process(self, dets, meta, out_thresh): if not ('bboxes' in dets): return [{}] preds = [] c, s = meta['center'].numpy(), meta['scale'].numpy() h, w = meta['out_height'].numpy(), meta['out_width'].numpy() trans = get_affine_transform( center=c[0], input_size=s[0], rot=0, output_size=[w[0], h[0]], shift=(0., 0.), inv=True).astype(np.float32) for i, dets_bbox in enumerate(dets['bboxes']): if dets_bbox[1] < out_thresh: break item = {} item['score'] = dets_bbox[1] item['class'] = int(dets_bbox[0]) + 1 item['ct'] = transform_preds_with_trans( dets['cts'][i].reshape([1, 2]), trans).reshape(2) if 'tracking' in dets: tracking = transform_preds_with_trans( (dets['tracking'][i] + dets['cts'][i]).reshape([1, 2]), trans).reshape(2) item['tracking'] = tracking - item['ct'] if 'bboxes' in dets: bbox = transform_preds_with_trans( dets_bbox[2:6].reshape([2, 2]), trans).reshape(4) item['bbox'] = bbox preds.append(item) return preds def transform_preds_with_trans(coords, trans): target_coords = np.ones((coords.shape[0], 3), np.float32) target_coords[:, :2] = coords target_coords = np.dot(trans, target_coords.transpose()).transpose() return target_coords[:, :2] def _tranpose_and_gather_feat(feat, bbox_inds): feat = feat.transpose([0, 2, 3, 1]) feat = feat.reshape([-1, feat.shape[3]]) feat = paddle.gather(feat, bbox_inds) return feat ================================================ FILE: ppdet/modeling/heads/clrnet_head.py ================================================ import math import paddle import numpy as np import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.initializer import normal_ from ppdet.modeling.lane_utils import Lane from ppdet.modeling.losses import line_iou from ppdet.modeling.clrnet_utils import ROIGather, LinearModule, SegDecoder __all__ = ['CLRHead'] @register class CLRHead(nn.Layer): __inject__ = ['loss'] __shared__ = [ 'img_w', 'img_h', 'ori_img_h', 'num_classes', 'cut_height', 'num_points', "max_lanes" ] def __init__(self, num_points=72, prior_feat_channels=64, fc_hidden_dim=64, num_priors=192, img_w=800, img_h=320, ori_img_h=590, cut_height=270, num_classes=5, num_fc=2, refine_layers=3, sample_points=36, conf_threshold=0.4, nms_thres=0.5, max_lanes=4, loss='CLRNetLoss'): super(CLRHead, self).__init__() self.img_w = img_w self.img_h = img_h self.n_strips = num_points - 1 self.n_offsets = num_points self.num_priors = num_priors self.sample_points = sample_points self.refine_layers = refine_layers self.num_classes = num_classes self.fc_hidden_dim = fc_hidden_dim self.ori_img_h = ori_img_h self.cut_height = cut_height self.conf_threshold 
= conf_threshold self.nms_thres = nms_thres self.max_lanes = max_lanes self.prior_feat_channels = prior_feat_channels self.loss = loss self.register_buffer( name='sample_x_indexs', tensor=(paddle.linspace( start=0, stop=1, num=self.sample_points, dtype=paddle.float32) * self.n_strips).astype(dtype='int64')) self.register_buffer( name='prior_feat_ys', tensor=paddle.flip( x=(1 - self.sample_x_indexs.astype('float32') / self.n_strips), axis=[-1])) self.register_buffer( name='prior_ys', tensor=paddle.linspace( start=1, stop=0, num=self.n_offsets).astype('float32')) self.prior_feat_channels = prior_feat_channels self._init_prior_embeddings() init_priors, priors_on_featmap = self.generate_priors_from_embeddings() self.register_buffer(name='priors', tensor=init_priors) self.register_buffer(name='priors_on_featmap', tensor=priors_on_featmap) self.seg_decoder = SegDecoder(self.img_h, self.img_w, self.num_classes, self.prior_feat_channels, self.refine_layers) reg_modules = list() cls_modules = list() for _ in range(num_fc): reg_modules += [*LinearModule(self.fc_hidden_dim)] cls_modules += [*LinearModule(self.fc_hidden_dim)] self.reg_modules = nn.LayerList(sublayers=reg_modules) self.cls_modules = nn.LayerList(sublayers=cls_modules) self.roi_gather = ROIGather(self.prior_feat_channels, self.num_priors, self.sample_points, self.fc_hidden_dim, self.refine_layers) self.reg_layers = nn.Linear( in_features=self.fc_hidden_dim, out_features=self.n_offsets + 1 + 2 + 1, bias_attr=True) self.cls_layers = nn.Linear( in_features=self.fc_hidden_dim, out_features=2, bias_attr=True) self.init_weights() def init_weights(self): for m in self.cls_layers.parameters(): normal_(m, mean=0.0, std=0.001) for m in self.reg_layers.parameters(): normal_(m, mean=0.0, std=0.001) def pool_prior_features(self, batch_features, num_priors, prior_xs): """ pool prior feature from feature map. 
Args: batch_features (Tensor): Input feature maps, shape: (B, C, H, W) """ batch_size = batch_features.shape[0] prior_xs = prior_xs.reshape([batch_size, num_priors, -1, 1]) prior_ys = self.prior_feat_ys.tile(repeat_times=[ batch_size * num_priors ]).reshape([batch_size, num_priors, -1, 1]) prior_xs = prior_xs * 2.0 - 1.0 prior_ys = prior_ys * 2.0 - 1.0 grid = paddle.concat(x=(prior_xs, prior_ys), axis=-1) feature = F.grid_sample( x=batch_features, grid=grid, align_corners=True).transpose(perm=[0, 2, 1, 3]) feature = feature.reshape([ batch_size * num_priors, self.prior_feat_channels, self.sample_points, 1 ]) return feature def generate_priors_from_embeddings(self): predictions = self.prior_embeddings.weight # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, 72 coordinates, score[0] = negative prob, score[1] = positive prob priors = paddle.zeros( (self.num_priors, 2 + 2 + 2 + self.n_offsets), dtype=predictions.dtype) priors[:, 2:5] = predictions.clone() priors[:, 6:] = ( priors[:, 3].unsqueeze(1).clone().tile([1, self.n_offsets]) * (self.img_w - 1) + ((1 - self.prior_ys.tile([self.num_priors, 1]) - priors[:, 2].unsqueeze(1).clone().tile([1, self.n_offsets])) * self.img_h / paddle.tan(x=priors[:, 4].unsqueeze(1).clone().tile( [1, self.n_offsets]) * math.pi + 1e-05))) / (self.img_w - 1) priors_on_featmap = paddle.index_select( priors, 6 + self.sample_x_indexs, axis=-1) return priors, priors_on_featmap def _init_prior_embeddings(self): self.prior_embeddings = nn.Embedding(self.num_priors, 3) bottom_priors_nums = self.num_priors * 3 // 4 left_priors_nums, _ = self.num_priors // 8, self.num_priors // 8 strip_size = 0.5 / (left_priors_nums // 2 - 1) bottom_strip_size = 1 / (bottom_priors_nums // 4 + 1) with paddle.no_grad(): for i in range(left_priors_nums): self.prior_embeddings.weight[i, 0] = i // 2 * strip_size self.prior_embeddings.weight[i, 1] = 0.0 self.prior_embeddings.weight[i, 2] = 0.16 if i % 2 == 0 else 0.32 for i in range(left_priors_nums, left_priors_nums + bottom_priors_nums): self.prior_embeddings.weight[i, 0] = 0.0 self.prior_embeddings.weight[i, 1] = ( (i - left_priors_nums) // 4 + 1) * bottom_strip_size self.prior_embeddings.weight[i, 2] = 0.2 * (i % 4 + 1) for i in range(left_priors_nums + bottom_priors_nums, self.num_priors): self.prior_embeddings.weight[i, 0] = ( i - left_priors_nums - bottom_priors_nums) // 2 * strip_size self.prior_embeddings.weight[i, 1] = 1.0 self.prior_embeddings.weight[i, 2] = 0.68 if i % 2 == 0 else 0.84 def forward(self, x, inputs=None): """ Take pyramid features as input to perform Cross Layer Refinement and finally output the prediction lanes. Each feature is a 4D tensor. 
Args: x: input features (list[Tensor]) Return: prediction_list: each layer's prediction result seg: segmentation result for auxiliary loss """ batch_features = list(x[len(x) - self.refine_layers:]) batch_features.reverse() batch_size = batch_features[-1].shape[0] if self.training: self.priors, self.priors_on_featmap = self.generate_priors_from_embeddings( ) priors, priors_on_featmap = self.priors.tile( [batch_size, 1, 1]), self.priors_on_featmap.tile([batch_size, 1, 1]) predictions_lists = [] prior_features_stages = [] for stage in range(self.refine_layers): num_priors = priors_on_featmap.shape[1] prior_xs = paddle.flip(x=priors_on_featmap, axis=[2]) batch_prior_features = self.pool_prior_features( batch_features[stage], num_priors, prior_xs) prior_features_stages.append(batch_prior_features) fc_features = self.roi_gather(prior_features_stages, batch_features[stage], stage) # return fc_features fc_features = fc_features.reshape( [num_priors, batch_size, -1]).reshape( [batch_size * num_priors, self.fc_hidden_dim]) cls_features = fc_features.clone() reg_features = fc_features.clone() for cls_layer in self.cls_modules: cls_features = cls_layer(cls_features) # return cls_features for reg_layer in self.reg_modules: reg_features = reg_layer(reg_features) cls_logits = self.cls_layers(cls_features) reg = self.reg_layers(reg_features) cls_logits = cls_logits.reshape( [batch_size, -1, cls_logits.shape[1]]) reg = reg.reshape([batch_size, -1, reg.shape[1]]) predictions = priors.clone() predictions[:, :, :2] = cls_logits predictions[:, :, 2:5] += reg[:, :, :3] predictions[:, :, 5] = reg[:, :, 3] def tran_tensor(t): return t.unsqueeze(axis=2).clone().tile([1, 1, self.n_offsets]) predictions[..., 6:] = ( tran_tensor(predictions[..., 3]) * (self.img_w - 1) + ((1 - self.prior_ys.tile([batch_size, num_priors, 1]) - tran_tensor(predictions[..., 2])) * self.img_h / paddle.tan( tran_tensor(predictions[..., 4]) * math.pi + 1e-05))) / ( self.img_w - 1) prediction_lines = predictions.clone() predictions[..., 6:] += reg[..., 4:] predictions_lists.append(predictions) if stage != self.refine_layers - 1: priors = prediction_lines.detach().clone() priors_on_featmap = priors.index_select( 6 + self.sample_x_indexs, axis=-1) if self.training: seg = None seg_features = paddle.concat( [ F.interpolate( feature, size=[ batch_features[-1].shape[2], batch_features[-1].shape[3] ], mode='bilinear', align_corners=False) for feature in batch_features ], axis=1) seg = self.seg_decoder(seg_features) output = {'predictions_lists': predictions_lists, 'seg': seg} return self.loss(output, inputs) return predictions_lists[-1] def predictions_to_pred(self, predictions): """ Convert predictions to internal Lane structure for evaluation. """ self.prior_ys = paddle.to_tensor(self.prior_ys) self.prior_ys = self.prior_ys.astype('float64') lanes = [] for lane in predictions: lane_xs = lane[6:].clone() start = min( max(0, int(round(lane[2].item() * self.n_strips))), self.n_strips) length = int(round(lane[5].item())) end = start + length - 1 end = min(end, len(self.prior_ys) - 1) if start > 0: mask = ((lane_xs[:start] >= 0.) 
& (lane_xs[:start] <= 1.)).cpu().detach().numpy()[::-1] mask = ~((mask.cumprod()[::-1]).astype(np.bool_)) lane_xs[:start][mask] = -2 if end < len(self.prior_ys) - 1: lane_xs[end + 1:] = -2 lane_ys = self.prior_ys[lane_xs >= 0].clone() lane_xs = lane_xs[lane_xs >= 0] lane_xs = lane_xs.flip(axis=0).astype('float64') lane_ys = lane_ys.flip(axis=0) lane_ys = (lane_ys * (self.ori_img_h - self.cut_height) + self.cut_height ) / self.ori_img_h if len(lane_xs) <= 1: continue points = paddle.stack( x=(lane_xs.reshape([-1, 1]), lane_ys.reshape([-1, 1])), axis=1).squeeze(axis=2) lane = Lane( points=points.cpu().numpy(), metadata={ 'start_x': lane[3], 'start_y': lane[2], 'conf': lane[1] }) lanes.append(lane) return lanes def lane_nms(self, predictions, scores, nms_overlap_thresh, top_k): """ NMS for lane detection. predictions: paddle.Tensor [num_lanes,conf,y,x,lenght,72offsets] [12,77] scores: paddle.Tensor [num_lanes] nms_overlap_thresh: float top_k: int """ # sort by scores to get idx idx = scores.argsort(descending=True) keep = [] condidates = predictions.clone() condidates = condidates.index_select(idx) while len(condidates) > 0: keep.append(idx[0]) if len(keep) >= top_k or len(condidates) == 1: break ious = [] for i in range(1, len(condidates)): ious.append(1 - line_iou( condidates[i].unsqueeze(0), condidates[0].unsqueeze(0), img_w=self.img_w, length=15)) ious = paddle.to_tensor(ious) mask = ious <= nms_overlap_thresh id = paddle.where(mask == False)[0] if id.shape[0] == 0: break condidates = condidates[1:].index_select(id) idx = idx[1:].index_select(id) keep = paddle.stack(keep) return keep def get_lanes(self, output, as_lanes=True): """ Convert model output to lanes. """ softmax = nn.Softmax(axis=1) decoded = [] for predictions in output: threshold = self.conf_threshold scores = softmax(predictions[:, :2])[:, 1] keep_inds = scores >= threshold predictions = predictions[keep_inds] scores = scores[keep_inds] if predictions.shape[0] == 0: decoded.append([]) continue nms_predictions = predictions.detach().clone() nms_predictions = paddle.concat( x=[nms_predictions[..., :4], nms_predictions[..., 5:]], axis=-1) nms_predictions[..., 4] = nms_predictions[..., 4] * self.n_strips nms_predictions[..., 5:] = nms_predictions[..., 5:] * ( self.img_w - 1) keep = self.lane_nms( nms_predictions[..., 5:], scores, nms_overlap_thresh=self.nms_thres, top_k=self.max_lanes) predictions = predictions.index_select(keep) if predictions.shape[0] == 0: decoded.append([]) continue predictions[:, 5] = paddle.round(predictions[:, 5] * self.n_strips) if as_lanes: pred = self.predictions_to_pred(predictions) else: pred = predictions decoded.append(pred) return decoded ================================================ FILE: ppdet/modeling/heads/detr_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
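# --- Editor's illustrative sketch (not part of the original file): the
# DETR-style heads below refine normalized box coordinates in logit space
# via `inverse_sigmoid`, imported from ..transformers.utils. Its standard
# form is a clipped logit; a minimal version for reference:
#
#   import paddle
#
#   def inverse_sigmoid_sketch(x, eps=1e-5):
#       x = x.clip(min=0., max=1.)
#       return paddle.log(x.clip(min=eps) / (1. - x).clip(min=eps))
#
# so that F.sigmoid(inverse_sigmoid(ref) + delta) nudges a reference point
# `ref` by a predicted offset, as in DeformableDETRHead.forward.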
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register import pycocotools.mask as mask_util from ..initializer import linear_init_, constant_ from ..transformers.utils import inverse_sigmoid __all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead', 'DINOv3Head'] class MLP(nn.Layer): """This code is based on https://github.com/facebookresearch/detr/blob/main/models/detr.py """ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.LayerList( nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) self._reset_parameters() def _reset_parameters(self): for l in self.layers: linear_init_(l) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x class MultiHeadAttentionMap(nn.Layer): """This code is based on https://github.com/facebookresearch/detr/blob/main/models/segmentation.py This is a 2D attention module, which only returns the attention softmax (no multiplication by value) """ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): super().__init__() self.num_heads = num_heads self.hidden_dim = hidden_dim self.dropout = nn.Dropout(dropout) weight_attr = paddle.ParamAttr( initializer=paddle.nn.initializer.XavierUniform()) bias_attr = paddle.framework.ParamAttr( initializer=paddle.nn.initializer.Constant()) if bias else False self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr) self.k_proj = nn.Conv2D( query_dim, hidden_dim, 1, weight_attr=weight_attr, bias_attr=bias_attr) self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5 def forward(self, q, k, mask=None): q = self.q_proj(q) k = self.k_proj(k) bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\ self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] qh = q.reshape([bs, num_queries, n, c]) kh = k.reshape([bs, n, c, h, w]) # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c]) kh = kh.reshape([-1, c, h * w]) weights = paddle.bmm(qh * self.normalize_fact, kh).reshape( [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4]) if mask is not None: weights += mask # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247 weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape) weights = self.dropout(weights) return weights class MaskHeadFPNConv(nn.Layer): """This code is based on https://github.com/facebookresearch/detr/blob/main/models/segmentation.py Simple convolutional head, using group norm. 
Upsampling is done using a FPN approach """ def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): super().__init__() inter_dims = [input_dim, ] + [context_dim // (2**i) for i in range(1, 5)] weight_attr = paddle.ParamAttr( initializer=paddle.nn.initializer.KaimingUniform()) bias_attr = paddle.framework.ParamAttr( initializer=paddle.nn.initializer.Constant()) self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups, weight_attr, bias_attr) self.conv_inter = nn.LayerList() for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): self.conv_inter.append( self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr, bias_attr)) self.conv_out = nn.Conv2D( inter_dims[-1], 1, 3, padding=1, weight_attr=weight_attr, bias_attr=bias_attr) self.adapter = nn.LayerList() for i in range(len(fpn_dims)): self.adapter.append( nn.Conv2D( fpn_dims[i], inter_dims[i + 1], 1, weight_attr=weight_attr, bias_attr=bias_attr)) def _make_layers(self, in_dims, out_dims, kernel_size, num_groups, weight_attr=None, bias_attr=None): return nn.Sequential( nn.Conv2D( in_dims, out_dims, kernel_size, padding=kernel_size // 2, weight_attr=weight_attr, bias_attr=bias_attr), nn.GroupNorm(num_groups, out_dims), nn.ReLU()) def forward(self, x, bbox_attention_map, fpns): x = paddle.concat([ x.tile([bbox_attention_map.shape[1], 1, 1, 1]), bbox_attention_map.flatten(0, 1) ], 1) x = self.conv0(x) for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1], self.adapter, fpns): feat = adapter_layer(feat).tile( [bbox_attention_map.shape[1], 1, 1, 1]) x = inter_layer(x) x = feat + F.interpolate(x, size=feat.shape[-2:]) x = self.conv_inter[-1](x) x = self.conv_out(x) return x @register class DETRHead(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss'] __inject__ = ['loss'] def __init__(self, num_classes=80, hidden_dim=256, nhead=8, num_mlp_layers=3, loss='DETRLoss', fpn_dims=[1024, 512, 256], with_mask_head=False, use_focal_loss=False): super(DETRHead, self).__init__() # add background class self.num_classes = num_classes if use_focal_loss else num_classes + 1 self.hidden_dim = hidden_dim self.loss = loss self.with_mask_head = with_mask_head self.use_focal_loss = use_focal_loss self.score_head = nn.Linear(hidden_dim, self.num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers) if self.with_mask_head: self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, nhead) self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, hidden_dim) self._reset_parameters() def _reset_parameters(self): linear_init_(self.score_head) @classmethod def from_config(cls, cfg, hidden_dim, nhead, input_shape): return { 'hidden_dim': hidden_dim, 'nhead': nhead, 'fpn_dims': [i.channels for i in input_shape[::-1]][1:] } @staticmethod def get_gt_mask_from_polygons(gt_poly, pad_mask): out_gt_mask = [] for polygons, padding in zip(gt_poly, pad_mask): height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) masks = [] for obj_poly in polygons: rles = mask_util.frPyObjects(obj_poly, height, width) rle = mask_util.merge(rles) masks.append( paddle.to_tensor(mask_util.decode(rle)).astype('float32')) masks = paddle.stack(masks) masks_pad = paddle.zeros( [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]]) masks_pad[:, :height, :width] = masks out_gt_mask.append(masks_pad) return out_gt_mask def forward(self, out_transformer, body_feats, inputs=None): r""" Args: out_transformer (Tuple): (feats: [num_levels, batch_size, num_queries, hidden_dim], 
memory: [batch_size, hidden_dim, h, w], src_proj: [batch_size, h*w, hidden_dim], src_mask: [batch_size, 1, 1, h, w]) body_feats (List(Tensor)): list[[B, C, H, W]] inputs (dict): dict(inputs) """ feats, memory, src_proj, src_mask = out_transformer outputs_logit = self.score_head(feats) outputs_bbox = F.sigmoid(self.bbox_head(feats)) outputs_seg = None if self.with_mask_head: bbox_attention_map = self.bbox_attention(feats[-1], memory, src_mask) fpn_feats = [a for a in body_feats[::-1]][1:] outputs_seg = self.mask_head(src_proj, bbox_attention_map, fpn_feats) outputs_seg = outputs_seg.reshape([ feats.shape[1], feats.shape[2], outputs_seg.shape[-2], outputs_seg.shape[-1] ]) if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs gt_mask = self.get_gt_mask_from_polygons( inputs['gt_poly'], inputs['pad_mask']) if 'gt_poly' in inputs else None return self.loss( outputs_bbox, outputs_logit, inputs['gt_bbox'], inputs['gt_class'], masks=outputs_seg, gt_mask=gt_mask) else: return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) @register class DeformableDETRHead(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] __inject__ = ['loss'] def __init__(self, num_classes=80, hidden_dim=512, nhead=8, num_mlp_layers=3, loss='DETRLoss'): super(DeformableDETRHead, self).__init__() self.num_classes = num_classes self.hidden_dim = hidden_dim self.nhead = nhead self.loss = loss self.score_head = nn.Linear(hidden_dim, self.num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers) self._reset_parameters() def _reset_parameters(self): linear_init_(self.score_head) constant_(self.score_head.bias, -4.595) constant_(self.bbox_head.layers[-1].weight) with paddle.no_grad(): bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) bias[2:] = -2.0 self.bbox_head.layers[-1].bias.set_value(bias) @classmethod def from_config(cls, cfg, hidden_dim, nhead, input_shape): return {'hidden_dim': hidden_dim, 'nhead': nhead} def forward(self, out_transformer, body_feats, inputs=None): r""" Args: out_transformer (Tuple): (feats: [num_levels, batch_size, num_queries, hidden_dim], memory: [batch_size, \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], reference_points: [batch_size, num_queries, 2]) body_feats (List(Tensor)): list[[B, C, H, W]] inputs (dict): dict(inputs) """ feats, memory, reference_points = out_transformer reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) outputs_bbox = self.bbox_head(feats) # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", # but the gradient is wrong in paddle. 
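# In-place slice updates such as `outputs_bbox[:, :, :, :2] += reference_points`
# can produce wrong gradients in Paddle (per the note above), so the offset is
# applied by rebuilding the tensor with concat instead. reference_points were
# mapped to logit space with inverse_sigmoid above, so after adding the raw
# bbox deltas the result is squashed back to [0, 1] by the sigmoid below.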
outputs_bbox = paddle.concat( [ outputs_bbox[:, :, :, :2] + reference_points, outputs_bbox[:, :, :, 2:] ], axis=-1) outputs_bbox = F.sigmoid(outputs_bbox) outputs_logit = self.score_head(feats) if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'], inputs['gt_class']) else: return (outputs_bbox[-1], outputs_logit[-1], None) @register class DINOHead(nn.Layer): __inject__ = ['loss'] def __init__(self, loss='DINOLoss', eval_idx=-1): super(DINOHead, self).__init__() self.loss = loss self.eval_idx = eval_idx def forward(self, out_transformer, body_feats, inputs=None): (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) = out_transformer if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs if dn_meta is not None: if isinstance(dn_meta, list): dual_groups = len(dn_meta) - 1 dec_out_bboxes = paddle.split( dec_out_bboxes, dual_groups + 1, axis=2) dec_out_logits = paddle.split( dec_out_logits, dual_groups + 1, axis=2) enc_topk_bboxes = paddle.split( enc_topk_bboxes, dual_groups + 1, axis=1) enc_topk_logits = paddle.split( enc_topk_logits, dual_groups + 1, axis=1) dec_out_bboxes_list = [] dec_out_logits_list = [] dn_out_bboxes_list = [] dn_out_logits_list = [] loss = {} for g_id in range(dual_groups + 1): if dn_meta[g_id] is not None: dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( dec_out_bboxes[g_id], dn_meta[g_id]['dn_num_split'], axis=2) dn_out_logits_gid, dec_out_logits_gid = paddle.split( dec_out_logits[g_id], dn_meta[g_id]['dn_num_split'], axis=2) else: dn_out_bboxes_gid, dn_out_logits_gid = None, None dec_out_bboxes_gid = dec_out_bboxes[g_id] dec_out_logits_gid = dec_out_logits[g_id] out_bboxes_gid = paddle.concat([ enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid ]) out_logits_gid = paddle.concat([ enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid ]) loss_gid = self.loss( out_bboxes_gid, out_logits_gid, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes_gid, dn_out_logits=dn_out_logits_gid, dn_meta=dn_meta[g_id]) # sum loss for key, value in loss_gid.items(): loss.update({ key: loss.get(key, paddle.zeros([1])) + value }) # average across (dual_groups + 1) for key, value in loss.items(): loss.update({key: value / (dual_groups + 1)}) return loss else: dn_out_bboxes, dec_out_bboxes = paddle.split( dec_out_bboxes, dn_meta['dn_num_split'], axis=2) dn_out_logits, dec_out_logits = paddle.split( dec_out_logits, dn_meta['dn_num_split'], axis=2) else: dn_out_bboxes, dn_out_logits = None, None out_bboxes = paddle.concat( [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) out_logits = paddle.concat( [enc_topk_logits.unsqueeze(0), dec_out_logits]) return self.loss( out_bboxes, out_logits, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes, dn_out_logits=dn_out_logits, dn_meta=dn_meta, gt_score=inputs.get('gt_score', None)) else: return (dec_out_bboxes[self.eval_idx], dec_out_logits[self.eval_idx], None) @register class MaskDINOHead(nn.Layer): __inject__ = ['loss'] def __init__(self, loss='DINOLoss'): super(MaskDINOHead, self).__init__() self.loss = loss def forward(self, out_transformer, body_feats, inputs=None): (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out, dn_meta) = out_transformer if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs assert 'gt_segm' in inputs if dn_meta is not None: dn_out_logits, dec_out_logits = 
paddle.split( dec_out_logits, dn_meta['dn_num_split'], axis=2) dn_out_bboxes, dec_out_bboxes = paddle.split( dec_out_bboxes, dn_meta['dn_num_split'], axis=2) dn_out_masks, dec_out_masks = paddle.split( dec_out_masks, dn_meta['dn_num_split'], axis=2) if init_out is not None: init_out_logits, init_out_bboxes, init_out_masks = init_out init_out_logits_dn, init_out_logits = paddle.split( init_out_logits, dn_meta['dn_num_split'], axis=1) init_out_bboxes_dn, init_out_bboxes = paddle.split( init_out_bboxes, dn_meta['dn_num_split'], axis=1) init_out_masks_dn, init_out_masks = paddle.split( init_out_masks, dn_meta['dn_num_split'], axis=1) dec_out_logits = paddle.concat( [init_out_logits.unsqueeze(0), dec_out_logits]) dec_out_bboxes = paddle.concat( [init_out_bboxes.unsqueeze(0), dec_out_bboxes]) dec_out_masks = paddle.concat( [init_out_masks.unsqueeze(0), dec_out_masks]) dn_out_logits = paddle.concat( [init_out_logits_dn.unsqueeze(0), dn_out_logits]) dn_out_bboxes = paddle.concat( [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes]) dn_out_masks = paddle.concat( [init_out_masks_dn.unsqueeze(0), dn_out_masks]) else: dn_out_bboxes, dn_out_logits = None, None dn_out_masks = None enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out out_logits = paddle.concat( [enc_out_logits.unsqueeze(0), dec_out_logits]) out_bboxes = paddle.concat( [enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) out_masks = paddle.concat( [enc_out_masks.unsqueeze(0), dec_out_masks]) inputs['gt_segm'] = [gt_segm.astype(out_masks.dtype) for gt_segm in inputs['gt_segm']] return self.loss( out_bboxes, out_logits, inputs['gt_bbox'], inputs['gt_class'], masks=out_masks, gt_mask=inputs['gt_segm'], dn_out_logits=dn_out_logits, dn_out_bboxes=dn_out_bboxes, dn_out_masks=dn_out_masks, dn_meta=dn_meta) else: return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) @register class DINOv3Head(nn.Layer): __inject__ = ['loss'] __shared__ = ['o2m_branch', 'num_queries_o2m'] def __init__(self, loss='DINOLoss', eval_idx=-1, o2m=4, o2m_branch=False, num_queries_o2m=450): super(DINOv3Head, self).__init__() self.loss = loss self.eval_idx = eval_idx self.o2m = o2m self.o2m_branch = o2m_branch self.num_queries_o2m = num_queries_o2m def forward(self, out_transformer, body_feats, inputs=None): (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) = out_transformer if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs if dn_meta is not None: num_groups = len(dn_meta) total_dec_queries = dec_out_bboxes.shape[2] total_enc_queries = enc_topk_bboxes.shape[1] loss = {} if self.o2m_branch: dec_out_bboxes, dec_out_bboxes_o2m = paddle.split(dec_out_bboxes, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2) dec_out_logits, dec_out_logits_o2m = paddle.split(dec_out_logits, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2) enc_topk_bboxes, enc_topk_bboxes_o2m = paddle.split(enc_topk_bboxes, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1) enc_topk_logits, enc_topk_logits_o2m = paddle.split(enc_topk_logits, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1) out_bboxes_o2m = paddle.concat([enc_topk_bboxes_o2m.unsqueeze(0), dec_out_bboxes_o2m]) out_logits_o2m = paddle.concat([enc_topk_logits_o2m.unsqueeze(0), dec_out_logits_o2m]) loss_o2m = self.loss( out_bboxes_o2m, out_logits_o2m, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=None, dn_out_logits=None, dn_meta=None, o2m=self.o2m) for key, value in 
loss_o2m.items(): key = key + '_o2m_branch' loss.update({ key: loss.get(key, paddle.zeros([1])) + value }) split_dec_num = [sum(dn['dn_num_split']) for dn in dn_meta] split_enc_num = [dn['dn_num_split'][1] for dn in dn_meta] dec_out_bboxes = paddle.split(dec_out_bboxes, split_dec_num, axis=2) dec_out_logits = paddle.split(dec_out_logits, split_dec_num, axis=2) enc_topk_bboxes = paddle.split(enc_topk_bboxes, split_enc_num, axis=1) enc_topk_logits = paddle.split(enc_topk_logits, split_enc_num, axis=1) for g_id in range(num_groups): dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( dec_out_bboxes[g_id], dn_meta[g_id]['dn_num_split'], axis=2) dn_out_logits_gid, dec_out_logits_gid = paddle.split( dec_out_logits[g_id], dn_meta[g_id]['dn_num_split'], axis=2) out_bboxes_gid = paddle.concat([ enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid]) out_logits_gid = paddle.concat([ enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid]) loss_gid = self.loss( out_bboxes_gid, out_logits_gid, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes_gid, dn_out_logits=dn_out_logits_gid, dn_meta=dn_meta[g_id]) # sum loss for key, value in loss_gid.items(): loss.update({ key: loss.get(key, paddle.zeros([1])) + value }) # average across num_groups for key, value in loss.items(): if '_o2m_branch' not in key: loss.update({key: value / num_groups}) return loss else: dn_out_bboxes, dn_out_logits = None, None out_bboxes = paddle.concat( [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) out_logits = paddle.concat( [enc_topk_logits.unsqueeze(0), dec_out_logits]) return self.loss( out_bboxes, out_logits, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes, dn_out_logits=dn_out_logits, dn_meta=dn_meta, gt_score=inputs.get('gt_score', None)) else: return (dec_out_bboxes[self.eval_idx], dec_out_logits[self.eval_idx], None) ================================================ FILE: ppdet/modeling/heads/face_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from ppdet.core.workspace import register from ..layers import AnchorGeneratorSSD from ..cls_utils import _get_class_default_kwargs @register class FaceHead(nn.Layer): """ Head block for Face detection network Args: num_classes (int): Number of output classes. in_channels (int): Number of input channels. anchor_generator(object): instance of anchor generator method. kernel_size (int): kernel size of Conv2D in FaceHead. padding (int): padding of Conv2D in FaceHead. conv_decay (float): weight decay for conv layer weights. loss (object): loss of face detection model.
""" __shared__ = ['num_classes'] __inject__ = ['anchor_generator', 'loss'] def __init__(self, num_classes=80, in_channels=[96, 96], anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD), kernel_size=3, padding=1, conv_decay=0., loss='SSDLoss'): super(FaceHead, self).__init__() # add background class self.num_classes = num_classes + 1 self.in_channels = in_channels self.anchor_generator = anchor_generator self.loss = loss if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) self.num_priors = self.anchor_generator.num_priors self.box_convs = [] self.score_convs = [] for i, num_prior in enumerate(self.num_priors): box_conv_name = "boxes{}".format(i) box_conv = self.add_sublayer( box_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * 4, kernel_size=kernel_size, padding=padding)) self.box_convs.append(box_conv) score_conv_name = "scores{}".format(i) score_conv = self.add_sublayer( score_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * self.num_classes, kernel_size=kernel_size, padding=padding)) self.score_convs.append(score_conv) @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def forward(self, feats, image, gt_bbox=None, gt_class=None): box_preds = [] cls_scores = [] prior_boxes = [] for feat, box_conv, score_conv in zip(feats, self.box_convs, self.score_convs): box_pred = box_conv(feat) box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) box_pred = paddle.reshape(box_pred, [0, -1, 4]) box_preds.append(box_pred) cls_score = score_conv(feat) cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) cls_scores.append(cls_score) prior_boxes = self.anchor_generator(feats, image) if self.training: return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, prior_boxes) else: return (box_preds, cls_scores), prior_boxes def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) ================================================ FILE: ppdet/modeling/heads/fcos_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.layers import ConvNormLayer, MultiClassNMS __all__ = ['FCOSFeat', 'FCOSHead', 'FCOSHead_ARSL'] class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. 
""" def __init__(self): super(ScaleReg, self).__init__() self.scale_reg = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=1.)), dtype="float32") def forward(self, inputs): out = inputs * self.scale_reg return out @register class FCOSFeat(nn.Layer): """ FCOSFeat of FCOS Args: feat_in (int): The channel number of input Tensor. feat_out (int): The channel number of output Tensor. num_convs (int): The convolution number of the FCOSFeat. norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. use_dcn (bool): Whether to use dcn in tower or not. """ def __init__(self, feat_in=256, feat_out=256, num_convs=4, norm_type='bn', use_dcn=False): super(FCOSFeat, self).__init__() self.feat_in = feat_in self.feat_out = feat_out self.num_convs = num_convs self.norm_type = norm_type self.cls_subnet_convs = [] self.reg_subnet_convs = [] for i in range(self.num_convs): in_c = feat_in if i == 0 else feat_out cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i) cls_conv = self.add_sublayer( cls_conv_name, ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=3, stride=1, norm_type=norm_type, use_dcn=use_dcn, bias_on=True, lr_scale=2.)) self.cls_subnet_convs.append(cls_conv) reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i) reg_conv = self.add_sublayer( reg_conv_name, ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=3, stride=1, norm_type=norm_type, use_dcn=use_dcn, bias_on=True, lr_scale=2.)) self.reg_subnet_convs.append(reg_conv) def forward(self, fpn_feat): cls_feat = fpn_feat reg_feat = fpn_feat for i in range(self.num_convs): cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat)) reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat)) return cls_feat, reg_feat @register class FCOSHead(nn.Layer): """ FCOSHead Args: num_classes (int): Number of classes fcos_feat (object): Instance of 'FCOSFeat' fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer norm_reg_targets (bool): Normalization the regression target if true centerness_on_reg (bool): The prediction of centerness on regression or clssification branch num_shift (float): Relative offset between the center of the first shift and the top-left corner of img fcos_loss (object): Instance of 'FCOSLoss' nms (object): Instance of 'MultiClassNMS' trt (bool): Whether to use trt in nms of deploy """ __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] __shared__ = ['num_classes', 'trt'] def __init__(self, num_classes=80, fcos_feat='FCOSFeat', fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, multiply_strides_reg_targets=False, norm_reg_targets=True, centerness_on_reg=True, num_shift=0.5, sqrt_score=False, fcos_loss='FCOSLoss', nms='MultiClassNMS', trt=False): super(FCOSHead, self).__init__() self.fcos_feat = fcos_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.fcos_loss = fcos_loss self.norm_reg_targets = norm_reg_targets self.centerness_on_reg = centerness_on_reg self.multiply_strides_reg_targets = multiply_strides_reg_targets self.num_shift = num_shift self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.sqrt_score = sqrt_score self.is_teacher = False conv_cls_name = "fcos_head_cls" bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) self.fcos_head_cls = self.add_sublayer( conv_cls_name, nn.Conv2D( in_channels=256, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), 
bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) conv_reg_name = "fcos_head_reg" self.fcos_head_reg = self.add_sublayer( conv_reg_name, nn.Conv2D( in_channels=256, out_channels=4, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) conv_centerness_name = "fcos_head_centerness" self.fcos_head_centerness = self.add_sublayer( conv_centerness_name, nn.Conv2D( in_channels=256, out_channels=1, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.scales_regs = [] for i in range(len(self.fpn_stride)): lvl = int(math.log(int(self.fpn_stride[i]), 2)) feat_name = 'p{}_feat'.format(lvl) scale_reg = self.add_sublayer(feat_name, ScaleReg()) self.scales_regs.append(scale_reg) def _compute_locations_by_level(self, fpn_stride, feature, num_shift=0.5): """ Compute locations of anchor points of each FPN layer Args: fpn_stride (int): The stride of current FPN feature map feature (Tensor): Tensor of current FPN feature map Return: Anchor points locations of current FPN feature map """ h, w = feature.shape[2], feature.shape[3] shift_x = paddle.arange(0, w * fpn_stride, fpn_stride) shift_y = paddle.arange(0, h * fpn_stride, fpn_stride) shift_x = paddle.unsqueeze(shift_x, axis=0) shift_y = paddle.unsqueeze(shift_y, axis=1) shift_x = paddle.expand(shift_x, shape=[h, w]) shift_y = paddle.expand(shift_y, shape=[h, w]) shift_x = paddle.reshape(shift_x, shape=[-1]) shift_y = paddle.reshape(shift_y, shape=[-1]) location = paddle.stack( [shift_x, shift_y], axis=-1) + float(fpn_stride * num_shift) return location def forward(self, fpn_feats, targets=None): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] centerness_list = [] for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, self.fpn_stride, fpn_feats): fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) cls_logits = self.fcos_head_cls(fcos_cls_feat) bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) if self.centerness_on_reg: centerness = self.fcos_head_centerness(fcos_reg_feat) else: centerness = self.fcos_head_centerness(fcos_cls_feat) if self.norm_reg_targets: bbox_reg = F.relu(bbox_reg) if self.multiply_strides_reg_targets: bbox_reg = bbox_reg * fpn_stride else: if not self.training or targets.get( 'get_data', False) or targets.get('is_teacher', False): bbox_reg = bbox_reg * fpn_stride else: bbox_reg = paddle.exp(bbox_reg) cls_logits_list.append(cls_logits) bboxes_reg_list.append(bbox_reg) centerness_list.append(centerness) if targets is not None: self.is_teacher = targets.get('is_teacher', False) if self.is_teacher: return [cls_logits_list, bboxes_reg_list, centerness_list] if self.training and targets is not None: get_data = targets.get('get_data', False) if get_data: return [cls_logits_list, bboxes_reg_list, centerness_list] losses = {} fcos_head_outs = [cls_logits_list, bboxes_reg_list, centerness_list] losses_fcos = self.get_loss(fcos_head_outs, targets) losses.update(losses_fcos) total_loss = paddle.add_n(list(losses.values())) losses.update({'loss': total_loss}) return losses else: # eval or infer locations_list = [] for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): location = self._compute_locations_by_level(fpn_stride, feature, self.num_shift) locations_list.append(location) fcos_head_outs = [ locations_list, 
cls_logits_list, bboxes_reg_list, centerness_list ] return fcos_head_outs def get_loss(self, fcos_head_outs, targets): cls_logits, bboxes_reg, centerness = fcos_head_outs # get labels,reg_target,centerness tag_labels, tag_bboxes, tag_centerness = [], [], [] for i in range(len(self.fpn_stride)): k_lbl = 'labels{}'.format(i) if k_lbl in targets: tag_labels.append(targets[k_lbl]) k_box = 'reg_target{}'.format(i) if k_box in targets: tag_bboxes.append(targets[k_box]) k_ctn = 'centerness{}'.format(i) if k_ctn in targets: tag_centerness.append(targets[k_ctn]) losses_fcos = self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_centerness) return losses_fcos def _post_process_by_level(self, locations, box_cls, box_reg, box_ctn, sqrt_score=False): box_scores = F.sigmoid(box_cls).flatten(2).transpose([0, 2, 1]) box_centerness = F.sigmoid(box_ctn).flatten(2).transpose([0, 2, 1]) pred_scores = box_scores * box_centerness if sqrt_score: pred_scores = paddle.sqrt(pred_scores) box_reg_ch_last = box_reg.flatten(2).transpose([0, 2, 1]) box_reg_decoding = paddle.stack( [ locations[:, 0] - box_reg_ch_last[:, :, 0], locations[:, 1] - box_reg_ch_last[:, :, 1], locations[:, 0] + box_reg_ch_last[:, :, 2], locations[:, 1] + box_reg_ch_last[:, :, 3] ], axis=1) pred_boxes = box_reg_decoding.transpose([0, 2, 1]) return pred_scores, pred_boxes def post_process(self, fcos_head_outs, scale_factor): locations, cls_logits, bboxes_reg, centerness = fcos_head_outs pred_bboxes, pred_scores = [], [] for pts, cls, reg, ctn in zip(locations, cls_logits, bboxes_reg, centerness): scores, boxes = self._post_process_by_level(pts, cls, reg, ctn, self.sqrt_score) pred_scores.append(scores) pred_bboxes.append(boxes) pred_bboxes = paddle.concat(pred_bboxes, axis=1) pred_scores = paddle.concat(pred_scores, axis=1) # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) pred_bboxes /= scale_factor pred_scores = pred_scores.transpose([0, 2, 1]) bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num @register class FCOSHead_ARSL(FCOSHead): """ FCOSHead of ARSL for semi-supervised detection (SSOD) Args: fcos_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer fcos_loss (object): Instance of 'FCOSLoss' norm_reg_targets (bool): Normalize the regression targets if true centerness_on_reg (bool): The prediction of centerness on regression or classification branch nms (object): Instance of 'MultiClassNMS' trt (bool): Whether to use trt in nms of deploy """ __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] __shared__ = ['num_classes', 'trt'] def __init__(self, num_classes=80, fcos_feat='FCOSFeat', fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, multiply_strides_reg_targets=False, norm_reg_targets=True, centerness_on_reg=True, num_shift=0.5, sqrt_score=False, fcos_loss='FCOSLossMILC', nms='MultiClassNMS', trt=False): super(FCOSHead_ARSL, self).__init__() self.fcos_feat = fcos_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.fcos_loss = fcos_loss self.norm_reg_targets = norm_reg_targets self.centerness_on_reg = centerness_on_reg self.multiply_strides_reg_targets = multiply_strides_reg_targets self.num_shift = num_shift self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt =
trt self.sqrt_score = sqrt_score conv_cls_name = "fcos_head_cls" bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) self.fcos_head_cls = self.add_sublayer( conv_cls_name, nn.Conv2D( in_channels=256, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) conv_reg_name = "fcos_head_reg" self.fcos_head_reg = self.add_sublayer( conv_reg_name, nn.Conv2D( in_channels=256, out_channels=4, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) conv_centerness_name = "fcos_head_centerness" self.fcos_head_centerness = self.add_sublayer( conv_centerness_name, nn.Conv2D( in_channels=256, out_channels=1, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.scales_regs = [] for i in range(len(self.fpn_stride)): lvl = int(math.log(int(self.fpn_stride[i]), 2)) feat_name = 'p{}_feat'.format(lvl) scale_reg = self.add_sublayer(feat_name, ScaleReg()) self.scales_regs.append(scale_reg) def forward(self, fpn_feats, targets=None): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] centerness_list = [] for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, self.fpn_stride, fpn_feats): fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) cls_logits = self.fcos_head_cls(fcos_cls_feat) bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) if self.centerness_on_reg: centerness = self.fcos_head_centerness(fcos_reg_feat) else: centerness = self.fcos_head_centerness(fcos_cls_feat) if self.norm_reg_targets: bbox_reg = F.relu(bbox_reg) if not self.training: bbox_reg = bbox_reg * fpn_stride else: bbox_reg = paddle.exp(bbox_reg) cls_logits_list.append(cls_logits) bboxes_reg_list.append(bbox_reg) centerness_list.append(centerness) if not self.training: locations_list = [] for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): location = self._compute_locations_by_level(fpn_stride, feature) locations_list.append(location) return locations_list, cls_logits_list, bboxes_reg_list, centerness_list else: return cls_logits_list, bboxes_reg_list, centerness_list def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness): cls_logits, bboxes_reg, centerness = fcos_head_outs return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_centerness) ================================================ FILE: ppdet/modeling/heads/fcosr_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
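# The FCOS-style heads above decode a box from each grid point and its
# predicted (l, t, r, b) distances: (x1, y1, x2, y2) = (cx - l, cy - t,
# cx + r, cy + b), with points placed on a stride grid shifted by
# stride * num_shift. A minimal NumPy sketch under those assumptions
# (names are illustrative, not part of this repository):
import numpy as np

def toy_fcos_points(h, w, stride, num_shift=0.5):
    # mirrors _compute_locations_by_level above
    xx, yy = np.meshgrid(np.arange(w) * stride, np.arange(h) * stride)
    return np.stack([xx.ravel(), yy.ravel()], axis=-1) + stride * num_shift

def toy_decode_ltrb(points, ltrb):
    # points: [N, 2] centers; ltrb: [N, 4] distances -> [N, 4] x1y1x2y2 boxes
    return np.concatenate([points - ltrb[:, :2], points + ltrb[:, 2:]], axis=1)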
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from paddle import ParamAttr from paddle.regularizer import L2Decay from .fcos_head import ScaleReg from ..initializer import bias_init_with_prob, constant_, normal_ from ..ops import get_act_fn, anchor_generator from ..rbox_utils import box2corners from ..losses import ProbIoULoss import numpy as np __all__ = ['FCOSRHead'] def trunc_div(a, b): ipt = paddle.divide(a, b) sign_ipt = paddle.sign(ipt) abs_ipt = paddle.abs(ipt) abs_ipt = paddle.floor(abs_ipt) out = paddle.multiply(sign_ipt, abs_ipt) return out def fmod(a, b): return a - trunc_div(a, b) * b def fmod_eval(a, b): return a - a.divide(b).cast(paddle.int32).cast(paddle.float32) * b class ConvBNLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, norm_cfg={'name': 'gn', 'num_groups': 32}, act=None): super(ConvBNLayer, self).__init__() self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=padding, groups=groups, bias_attr=False) norm_type = norm_cfg['name'] if norm_type in ['sync_bn', 'bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) else: groups = norm_cfg.get('num_groups', 1) self.norm = nn.GroupNorm( num_groups=groups, num_channels=ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): x = self.conv(x) x = self.norm(x) x = self.act(x) return x @register class FCOSRHead(nn.Layer): """ FCOSR Head, refer to https://arxiv.org/abs/2111.10780 for details """ __shared__ = ['num_classes', 'trt'] __inject__ = ['assigner', 'nms'] def __init__(self, num_classes=15, in_channels=256, feat_channels=256, stacked_convs=4, act='relu', fpn_strides=[4, 8, 16, 32, 64], trt=False, loss_weight={'class': 1.0, 'probiou': 1.0}, norm_cfg={'name': 'gn', 'num_groups': 32}, assigner='FCOSRAssigner', nms='MultiClassNMS'): super(FCOSRHead, self).__init__() self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.stacked_convs = stacked_convs self.loss_weight = loss_weight self.half_pi = paddle.to_tensor( [1.5707963267948966], dtype=paddle.float32) self.probiou_loss = ProbIoULoss(mode='l1') act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.trt = trt self.loss_weight = loss_weight self.assigner = assigner self.nms = nms # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() for i in range(self.stacked_convs): self.stem_cls.append( ConvBNLayer( self.in_channels[i], feat_channels, filter_size=3, stride=1, padding=1, norm_cfg=norm_cfg, act=act)) self.stem_reg.append( ConvBNLayer( self.in_channels[i], feat_channels, filter_size=3, stride=1, padding=1, norm_cfg=norm_cfg, act=act)) self.scales = nn.LayerList( [ScaleReg() for _ in range(len(fpn_strides))]) # prediction self.pred_cls = nn.Conv2D(feat_channels, self.num_classes, 3, padding=1) self.pred_xy = nn.Conv2D(feat_channels, 2, 3, padding=1) self.pred_wh = nn.Conv2D(feat_channels, 2, 3, padding=1) self.pred_angle = nn.Conv2D(feat_channels, 1, 3, padding=1) self._init_weights() def _init_weights(self): for cls_, reg_ in zip(self.stem_cls, self.stem_reg): normal_(cls_.conv.weight, std=0.01) normal_(reg_.conv.weight, std=0.01) bias_cls = bias_init_with_prob(0.01) 
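# bias_init_with_prob applies the focal-loss prior trick: the classification
# bias is initialised to -log((1 - p) / p) with p = 0.01, so initial sigmoid
# scores sit near p and training is not swamped early on by the overwhelming
# number of background locations (the FCOS/GFL heads above compute the same
# quantity inline as bias_init_value).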
normal_(self.pred_cls.weight, std=0.01) constant_(self.pred_cls.bias, bias_cls) normal_(self.pred_xy.weight, std=0.01) normal_(self.pred_wh.weight, std=0.01) normal_(self.pred_angle.weight, std=0.01) @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _generate_anchors(self, feats): if self.trt: anchor_points = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape anchor, _ = anchor_generator( feat, stride * 4, 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], offset=0.5) x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) xc = (x1 + x2 + 1) / 2 yc = (y1 + y2 + 1) / 2 anchor_point = paddle.concat( [xc, yc], axis=-1).reshape((1, h * w, 2)) anchor_points.append(anchor_point) anchor_points = paddle.concat(anchor_points, axis=1) return anchor_points, None, None else: anchor_points = [] stride_tensor = [] num_anchors_list = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape shift_x = (paddle.arange(end=w) + 0.5) * stride shift_y = (paddle.arange(end=h) + 0.5) * stride shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([1, -1, 2])) stride_tensor.append( paddle.full( [1, h * w, 1], stride, dtype='float32')) num_anchors_list.append(h * w) anchor_points = paddle.concat(anchor_points, axis=1) stride_tensor = paddle.concat(stride_tensor, axis=1) return anchor_points, stride_tensor, num_anchors_list def forward(self, feats, target=None): if self.training: return self.forward_train(feats, target) else: return self.forward_eval(feats, target) def forward_train(self, feats, target=None): anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( feats) cls_pred_list, reg_pred_list = [], [] for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): # cls cls_feat = feat for cls_layer in self.stem_cls: cls_feat = cls_layer(cls_feat) cls_pred = F.sigmoid(self.pred_cls(cls_feat)) cls_pred_list.append(cls_pred.flatten(2).transpose((0, 2, 1))) # reg reg_feat = feat for reg_layer in self.stem_reg: reg_feat = reg_layer(reg_feat) reg_xy = scale(self.pred_xy(reg_feat)) * stride reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride reg_angle = self.pred_angle(reg_feat) reg_angle = fmod(reg_angle, self.half_pi) reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) reg_pred_list.append(reg_pred.flatten(2).transpose((0, 2, 1))) cls_pred_list = paddle.concat(cls_pred_list, axis=1) reg_pred_list = paddle.concat(reg_pred_list, axis=1) return self.get_loss([ cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list ], target) def forward_eval(self, feats, target=None): cls_pred_list, reg_pred_list = [], [] anchor_points, _, _ = self._generate_anchors(feats) for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): b, _, h, w = feat.shape # cls cls_feat = feat for cls_layer in self.stem_cls: cls_feat = cls_layer(cls_feat) cls_pred = F.sigmoid(self.pred_cls(cls_feat)) cls_pred_list.append(cls_pred.reshape([b, self.num_classes, h * w])) # reg reg_feat = feat for reg_layer in self.stem_reg: reg_feat = reg_layer(reg_feat) reg_xy = scale(self.pred_xy(reg_feat)) * stride reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) 
* stride reg_angle = self.pred_angle(reg_feat) reg_angle = fmod_eval(reg_angle, self.half_pi) reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) reg_pred = reg_pred.reshape([b, 5, h * w]).transpose((0, 2, 1)) reg_pred_list.append(reg_pred) cls_pred_list = paddle.concat(cls_pred_list, axis=2) reg_pred_list = paddle.concat(reg_pred_list, axis=1) reg_pred_list = self._bbox_decode(anchor_points, reg_pred_list) return cls_pred_list, reg_pred_list def _bbox_decode(self, points, reg_pred_list): xy, wha = paddle.split(reg_pred_list, [2, 3], axis=-1) xy = xy + points return paddle.concat([xy, wha], axis=-1) def _box2corners(self, pred_bboxes): """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) Args: pred_bboxes (Tensor): [B, N, 5] Returns: polys (Tensor): [B, N, 8] """ x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) cos_a_half = paddle.cos(angle) * 0.5 sin_a_half = paddle.sin(angle) * 0.5 w_x = cos_a_half * w w_y = sin_a_half * w h_x = -sin_a_half * h h_y = cos_a_half * h return paddle.concat( [ x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y ], axis=-1) def get_loss(self, head_outs, gt_meta): cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] gt_rboxes = gt_meta['gt_rbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # decode pred_rboxes = self._bbox_decode(anchor_points, reg_pred_list) # label assignment assigned_labels, assigned_rboxes, assigned_scores = \ self.assigner( anchor_points, stride_tensor, num_anchors_list, gt_labels, gt_bboxes, gt_rboxes, pad_gt_mask, self.num_classes, pred_rboxes ) # reg_loss mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.sum().item() if num_pos > 0: bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) pred_rboxes_pos = paddle.masked_select(pred_rboxes, bbox_mask).reshape([-1, 5]) assigned_rboxes_pos = paddle.masked_select( assigned_rboxes, bbox_mask).reshape([-1, 5]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).reshape([-1]) avg_factor = bbox_weight.sum() loss_probiou = self.probiou_loss(pred_rboxes_pos, assigned_rboxes_pos) loss_probiou = paddle.sum(loss_probiou * bbox_weight) / avg_factor else: loss_probiou = pred_rboxes.sum() * 0. 
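# When a batch has no positive assignments, the ProbIoU loss is written as
# `pred_rboxes.sum() * 0.` rather than a plain constant: the zero stays
# connected to the network outputs, so every parameter still receives a
# (zero) gradient and gradient synchronization stays consistent across
# ranks in distributed training.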
avg_factor = max(num_pos, 1.0) # cls_loss loss_cls = self._qfocal_loss( cls_pred_list, assigned_scores, reduction='sum') loss_cls = loss_cls / avg_factor loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['probiou'] * loss_probiou out_dict = { 'loss': loss, 'loss_probiou': loss_probiou, 'loss_cls': loss_cls } return out_dict @staticmethod def _qfocal_loss(score, label, gamma=2.0, reduction='sum'): weight = (score - label).pow(gamma) loss = F.binary_cross_entropy( score, label, weight=weight, reduction=reduction) return loss def post_process(self, head_outs, scale_factor): pred_scores, pred_rboxes = head_outs # [B, N, 5] -> [B, N, 4, 2] -> [B, N, 8] pred_rboxes = self._box2corners(pred_rboxes) # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [ scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, scale_y ], axis=-1).reshape([-1, 1, 8]) pred_rboxes /= scale_factor bbox_pred, bbox_num, before_nms_indexes = self.nms(pred_rboxes, pred_scores) return bbox_pred, bbox_num, before_nms_indexes ================================================ FILE: ppdet/modeling/heads/gfl_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/gfl_head.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox from ppdet.data.transform.atss_assigner import bbox_overlaps __all__ = ['GFLHead', 'LDGFLHead'] class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. """ def __init__(self): super(ScaleReg, self).__init__() self.scale_reg = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=1.)), dtype="float32") def forward(self, inputs): out = inputs * self.scale_reg return out class Integral(nn.Layer): """A fixed layer for calculating integral result from distribution. This layer calculates the target location by :math: `sum{P(y_i) * y_i}`, P(y_i) denotes the softmax vector that represents the discrete distribution y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} Args: reg_max (int): The maximal value of the discrete set. Default: 16. You may want to reset it according to your new dataset or related settings. """ def __init__(self, reg_max=16): super(Integral, self).__init__() self.reg_max = reg_max self.register_buffer('project', paddle.linspace(0, self.reg_max, self.reg_max + 1)) def forward(self, x): """Forward feature from the regression head to get integral result of bounding box location. 
Args: x (Tensor): Features of the regression head, shape (N, 4*(n+1)), n is self.reg_max. Returns: x (Tensor): Integral result of box locations, i.e., distance offsets from the box center in four directions, shape (N, 4). """ x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1) x = F.linear(x, self.project) if self.training: x = x.reshape([-1, 4]) return x @register class DGQP(nn.Layer): """Distribution-Guided Quality Predictor of GFocal head Args: reg_topk (int): top-k statistics of distribution to guide LQE reg_channels (int): hidden layer unit to generate LQE add_mean (bool): Whether to calculate the mean of top-k statistics """ def __init__(self, reg_topk=4, reg_channels=64, add_mean=True): super(DGQP, self).__init__() self.reg_topk = reg_topk self.reg_channels = reg_channels self.add_mean = add_mean self.total_dim = reg_topk if add_mean: self.total_dim += 1 self.reg_conv1 = self.add_sublayer( 'dgqp_reg_conv1', nn.Conv2D( in_channels=4 * self.total_dim, out_channels=self.reg_channels, kernel_size=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.reg_conv2 = self.add_sublayer( 'dgqp_reg_conv2', nn.Conv2D( in_channels=self.reg_channels, out_channels=1, kernel_size=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) def forward(self, x): """Predict localization quality from statistics of the regression distribution. Args: x (Tensor): Regression distribution logits, shape (N, 4*(n+1), H, W), n is self.reg_max. Returns: y (Tensor): Quality score in [0, 1] for each location, shape (N, 1, H, W). """ N, _, H, W = x.shape[:] prob = F.softmax(x.reshape([N, 4, -1, H, W]), axis=2) prob_topk, _ = prob.topk(self.reg_topk, axis=2) if self.add_mean: stat = paddle.concat( [prob_topk, prob_topk.mean( axis=2, keepdim=True)], axis=2) else: stat = prob_topk y = F.relu(self.reg_conv1(stat.reshape([N, 4 * self.total_dim, H, W]))) y = F.sigmoid(self.reg_conv2(y)) return y @register class GFLHead(nn.Layer): """ GFLHead Args: conv_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of QualityFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16.
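(With reg_max = 16 the regression branch predicts a 17-bin discrete distribution for each side of the box; the Integral layer above converts it to a continuous offset as sum_i softmax(p)_i * i.)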
""" __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='QualityFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(GFLHead, self).__init__() self.conv_feat = conv_feat self.dgqp_module = dgqp_module self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_qfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.use_sigmoid = self.loss_qfl.use_sigmoid if self.use_sigmoid: self.cls_out_channels = self.num_classes else: self.cls_out_channels = self.num_classes + 1 conv_cls_name = "gfl_head_cls" bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) self.gfl_head_cls = self.add_sublayer( conv_cls_name, nn.Conv2D( in_channels=self.feat_in_chan, out_channels=self.cls_out_channels, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) conv_reg_name = "gfl_head_reg" self.gfl_head_reg = self.add_sublayer( conv_reg_name, nn.Conv2D( in_channels=self.feat_in_chan, out_channels=4 * (self.reg_max + 1), kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.scales_regs = [] for i in range(len(self.fpn_stride)): lvl = int(math.log(int(self.fpn_stride[i]), 2)) feat_name = 'p{}_feat'.format(lvl) scale_reg = self.add_sublayer(feat_name, ScaleReg()) self.scales_regs.append(scale_reg) self.distribution_project = Integral(self.reg_max) def forward(self, fpn_feats): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] for stride, scale_reg, fpn_feat in zip(self.fpn_stride, self.scales_regs, fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) cls_score = self.gfl_head_cls(conv_cls_feat) bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score if not self.training: cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) b, cell_h, cell_w, _ = cls_score.shape y, x = self.get_single_level_center_point( [cell_h, cell_w], stride, cell_offset=self.cell_offset) center_points = paddle.stack([x, y], axis=-1) cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) bbox_pred = self.distribution_project(bbox_pred) * stride bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4]) # NOTE: If keep_ratio=False and image shape value that # multiples of 32, distance2bbox not set max_shapes parameter # to speed up model prediction. If need to set max_shapes, # please use inputs['im_shape']. bbox_pred = batch_distance2bbox( center_points, bbox_pred, max_shapes=None) cls_logits_list.append(cls_score) bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) def _images_to_levels(self, target, num_level_anchors): """ Convert targets by image to targets by feature level. 
""" level_targets = [] start = 0 for n in num_level_anchors: end = start + n level_targets.append(target[:, start:end].squeeze(0)) start = end return level_targets def _grid_cells_to_center(self, grid_cells): """ Get center location of each gird cell Args: grid_cells: grid cells of a feature map Returns: center points """ cells_cx = (grid_cells[:, 2] + grid_cells[:, 0]) / 2 cells_cy = (grid_cells[:, 3] + grid_cells[:, 1]) / 2 return paddle.stack([cells_cx, cells_cy], axis=-1) def get_loss(self, gfl_head_outs, gt_meta): cls_logits, bboxes_reg = gfl_head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits ] grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], num_level_anchors) labels_list = self._images_to_levels(gt_meta['labels'], num_level_anchors) label_weights_list = self._images_to_levels(gt_meta['label_weights'], num_level_anchors) bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], num_level_anchors) num_total_pos = sum(gt_meta['pos_num']) try: paddle.distributed.all_reduce(num_total_pos) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1) except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride in zip( cls_logits, bboxes_reg, grid_cells_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride): grid_cells = grid_cells.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) label_weights = label_weights.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) score = np.zeros(labels.shape) if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) pos_grid_cell_centers = self._grid_cells_to_center( pos_grid_cells) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) score[pos_inds.numpy()] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_grid_cell_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') # qfl loss score = paddle.to_tensor(score) loss_qfl = self.loss_qfl( cls_score, (labels, score), weight=label_weights, avg_factor=num_total_pos) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) 
loss_qfl_list.append(loss_qfl) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) try: paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_qfl = sum(loss_qfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_states = dict( loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states def get_single_level_center_point(self, featmap_size, stride, cell_offset=0): """ Generate pixel centers of a single stage feature map. Args: featmap_size: height and width of the feature map stride: down sample stride of the feature map Returns: y and x of the center points """ h, w = featmap_size x_range = (paddle.arange(w, dtype='float32') + cell_offset) * stride y_range = (paddle.arange(h, dtype='float32') + cell_offset) * stride y, x = paddle.meshgrid(y_range, x_range) y = y.flatten() x = x.flatten() return y, x def post_process(self, gfl_head_outs, im_shape, scale_factor): cls_scores, bboxes_reg = gfl_head_outs bboxes = paddle.concat(bboxes_reg, axis=1) # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) bboxes /= im_scale mlvl_scores = paddle.concat(cls_scores, axis=1) mlvl_scores = mlvl_scores.transpose([0, 2, 1]) bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores) return bbox_pred, bbox_num @register class LDGFLHead(GFLHead): """ GFLHead for LD (localization distillation) Args: conv_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of QualityFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16.
""" __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'loss_ld', 'loss_ld_vlr', 'loss_kd', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='QualityFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', loss_ld='KnowledgeDistillationKLDivLoss', loss_ld_vlr='KnowledgeDistillationKLDivLoss', loss_kd='KnowledgeDistillationKLDivLoss', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(LDGFLHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.loss_ld = loss_ld self.loss_kd = loss_kd self.loss_ld_vlr = loss_ld_vlr def forward(self, fpn_feats): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] for stride, scale_reg, fpn_feat in zip(self.fpn_stride, self.scales_regs, fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) cls_score = self.gfl_head_cls(conv_cls_feat) bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score if not self.training: cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) b, cell_h, cell_w, _ = cls_score.shape y, x = self.get_single_level_center_point( [cell_h, cell_w], stride, cell_offset=self.cell_offset) center_points = paddle.stack([x, y], axis=-1) cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) bbox_pred = self.distribution_project(bbox_pred) * stride bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) # NOTE: If keep_ratio=False and image shape value that # multiples of 32, distance2bbox not set max_shapes parameter # to speed up model prediction. If need to set max_shapes, # please use inputs['im_shape']. bbox_pred = batch_distance2bbox( center_points, bbox_pred, max_shapes=None) cls_logits_list.append(cls_score) bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) def get_loss(self, gfl_head_outs, gt_meta, soft_label_list, soft_targets_list): cls_logits, bboxes_reg = gfl_head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits ] grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], num_level_anchors) labels_list = self._images_to_levels(gt_meta['labels'], num_level_anchors) label_weights_list = self._images_to_levels(gt_meta['label_weights'], num_level_anchors) bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], num_level_anchors) # vlr regions vlr_regions_list = self._images_to_levels(gt_meta['vlr_regions'], num_level_anchors) num_total_pos = sum(gt_meta['pos_num']) try: paddle.distributed.all_reduce(num_total_pos) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1.) 
except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_qfl_list, loss_ld_list, avg_factor = [], [], [], [], [] loss_ld_vlr_list, loss_kd_list = [], [] for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride, soft_targets,\ soft_label, vlr_region in zip( cls_logits, bboxes_reg, grid_cells_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride, soft_targets_list, soft_label_list, vlr_regions_list): grid_cells = grid_cells.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) soft_targets = soft_targets.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) soft_label = soft_label.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) # feture im # teacher_x = teacher_x.transpose([0, 2, 3, 1]).reshape([-1, 256]) # x = x.transpose([0, 2, 3, 1]).reshape([-1, 256]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) label_weights = label_weights.reshape([-1]) vlr_region = vlr_region.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) score = np.zeros(labels.shape) remain_inds = (vlr_region > 0).nonzero() if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) pos_grid_cell_centers = self._grid_cells_to_center( pos_grid_cells) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) score[pos_inds.numpy()] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) pos_soft_targets = paddle.gather(soft_targets, pos_inds, axis=0) soft_corners = pos_soft_targets.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_grid_cell_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) # ld loss loss_ld = self.loss_ld( pred_corners, soft_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) loss_kd = self.loss_kd( paddle.gather( cls_score, pos_inds, axis=0), paddle.gather( soft_label, pos_inds, axis=0), weight=paddle.gather( label_weights, pos_inds, axis=0), avg_factor=pos_inds.shape[0]) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 loss_ld = bbox_pred.sum() * 0 loss_kd = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') if len(remain_inds) > 0: neg_pred_corners = bbox_pred[remain_inds].reshape( [-1, self.reg_max + 1]) neg_soft_corners = soft_targets[remain_inds].reshape( [-1, self.reg_max + 1]) remain_targets = vlr_region[remain_inds] loss_ld_vlr = self.loss_ld_vlr( neg_pred_corners, neg_soft_corners, weight=remain_targets.expand([-1, 
4]).reshape([-1]), avg_factor=16.0) else: loss_ld_vlr = bbox_pred.sum() * 0 # qfl loss score = paddle.to_tensor(score) loss_qfl = self.loss_qfl( cls_score, (labels, score), weight=label_weights, avg_factor=num_total_pos) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) loss_qfl_list.append(loss_qfl) loss_ld_list.append(loss_ld) loss_ld_vlr_list.append(loss_ld_vlr) loss_kd_list.append(loss_kd) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) # + 1e-6 try: paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_ld = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_ld_vlr = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_kd = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_qfl = sum(loss_qfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_ld = sum(loss_ld_list) loss_ld_vlr = sum(loss_ld_vlr_list) loss_kd = sum(loss_kd_list) loss_states = dict( loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl, loss_ld=loss_ld, loss_ld_vlr=loss_ld_vlr, loss_kd=loss_kd) return loss_states ================================================ FILE: ppdet/modeling/heads/keypoint_hrhrnet_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from ppdet.core.workspace import register from .. 
import layers as L from ..backbones.hrnet import BasicBlock @register class HrHRNetHead(nn.Layer): __inject__ = ['loss'] def __init__(self, num_joints, loss='HrHRNetLoss', swahr=False, width=32): """ Head for HigherHRNet network Args: num_joints (int): number of keypoints loss (object): HrHRNetLoss instance swahr (bool): whether to use SWAHR (scale- and weight-adaptive heatmap regression) width (int): hrnet channel width """ super(HrHRNetHead, self).__init__() self.loss = loss self.num_joints = num_joints num_featout1 = num_joints * 2 num_featout2 = num_joints self.swahr = swahr self.conv1 = L.Conv2d(width, num_featout1, 1, 1, 0, bias=True) self.conv2 = L.Conv2d(width, num_featout2, 1, 1, 0, bias=True) self.deconv = nn.Sequential( L.ConvTranspose2d( num_featout1 + width, width, 4, 2, 1, 0, bias=False), L.BatchNorm2d(width), L.ReLU()) self.blocks = nn.Sequential(*(BasicBlock( num_channels=width, num_filters=width, has_se=False, freeze_norm=False, name='HrHRNetHead_{}'.format(i)) for i in range(4))) self.interpolate = L.Upsample(2, mode='bilinear') self.concat = L.Concat(dim=1) if swahr: self.scalelayer0 = nn.Sequential( L.Conv2d( width, num_joints, 1, 1, 0, bias=True), L.BatchNorm2d(num_joints), L.ReLU(), L.Conv2d( num_joints, num_joints, 9, 1, 4, groups=num_joints, bias=True)) self.scalelayer1 = nn.Sequential( L.Conv2d( width, num_joints, 1, 1, 0, bias=True), L.BatchNorm2d(num_joints), L.ReLU(), L.Conv2d( num_joints, num_joints, 9, 1, 4, groups=num_joints, bias=True)) def forward(self, feats, targets=None): x1 = feats[0] xo1 = self.conv1(x1) x2 = self.blocks(self.deconv(self.concat((x1, xo1)))) xo2 = self.conv2(x2) num_joints = self.num_joints if self.training: heatmap1, tagmap = paddle.split(xo1, 2, axis=1) if self.swahr: so1 = self.scalelayer0(x1) so2 = self.scalelayer1(x2) hrhrnet_outputs = ([heatmap1, so1], [xo2, so2], tagmap) return self.loss(hrhrnet_outputs, targets) else: hrhrnet_outputs = (heatmap1, xo2, tagmap) return self.loss(hrhrnet_outputs, targets) # averaged heatmap, upsampled tagmap upsampled = self.interpolate(xo1) avg = (upsampled[:, :num_joints] + xo2[:, :num_joints]) / 2 return avg, upsampled[:, num_joints:] ================================================ FILE: ppdet/modeling/heads/mask_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, create from ppdet.modeling.layers import ConvNormLayer from .roi_extractor import RoIAlign from ..cls_utils import _get_class_default_kwargs @register class MaskFeat(nn.Layer): """ Feature extraction in Mask head Args: in_channel (int): Input channels out_channel (int): Output channels num_convs (int): The number of conv layers, default 4 norm_type (string | None): Norm type, bn, gn, sync_bn are available, default None """ def __init__(self, in_channel=256, out_channel=256, num_convs=4, norm_type=None): super(MaskFeat, self).__init__() self.num_convs = num_convs self.in_channel = in_channel self.out_channel = out_channel self.norm_type = norm_type fan_conv = out_channel * 3 * 3 fan_deconv = out_channel * 2 * 2 mask_conv = nn.Sequential() if norm_type == 'gn': for i in range(self.num_convs): conv_name = 'mask_inter_feat_{}'.format(i + 1) mask_conv.add_sublayer( conv_name, ConvNormLayer( ch_in=in_channel if i == 0 else out_channel, ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, initializer=KaimingNormal(fan_in=fan_conv), skip_quant=True)) mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) else: for i in range(self.num_convs): conv_name = 'mask_inter_feat_{}'.format(i + 1) conv = nn.Conv2D( in_channels=in_channel if i == 0 else out_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=paddle.ParamAttr( initializer=KaimingNormal(fan_in=fan_conv))) conv.skip_quant = True mask_conv.add_sublayer(conv_name, conv) mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) mask_conv.add_sublayer( 'conv5_mask', nn.Conv2DTranspose( in_channels=self.out_channel if num_convs > 0 else self.in_channel, out_channels=self.out_channel, kernel_size=2, stride=2, weight_attr=paddle.ParamAttr( initializer=KaimingNormal(fan_in=fan_deconv)))) mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU()) self.upsample = mask_conv @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channel': input_shape.channels, } def out_channels(self): return self.out_channel def forward(self, feats): return self.upsample(feats) @register class MaskHead(nn.Layer): __shared__ = ['num_classes', 'export_onnx'] __inject__ = ['mask_assigner'] """ RCNN mask head Args: head (nn.Layer): Extract feature in mask head roi_extractor (object): The module of RoI Extractor mask_assigner (object): The module of Mask Assigner, label and sample the mask num_classes (int): The number of classes share_bbox_feat (bool): Whether to share the feature from bbox head, default false """ def __init__(self, head, roi_extractor=_get_class_default_kwargs(RoIAlign), mask_assigner='MaskAssigner', num_classes=80, share_bbox_feat=False, export_onnx=False): super(MaskHead, self).__init__() self.num_classes = num_classes self.export_onnx = export_onnx self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.head = head self.in_channels = head.out_channels() self.mask_assigner = mask_assigner self.share_bbox_feat = share_bbox_feat self.bbox_head = None self.mask_fcn_logits = nn.Conv2D( in_channels=self.in_channels, out_channels=self.num_classes, kernel_size=1, weight_attr=paddle.ParamAttr(initializer=KaimingNormal( fan_in=self.num_classes))) self.mask_fcn_logits.skip_quant = True @classmethod def from_config(cls, cfg, 
input_shape): roi_pooler = cfg['roi_extractor'] assert isinstance(roi_pooler, dict) kwargs = RoIAlign.from_config(cfg, input_shape) roi_pooler.update(kwargs) kwargs = {'input_shape': input_shape} head = create(cfg['head'], **kwargs) return { 'roi_extractor': roi_pooler, 'head': head, } def get_loss(self, mask_logits, mask_label, mask_target, mask_weight): mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3]) mask_label = paddle.expand_as(mask_label, mask_logits) mask_label.stop_gradient = True mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) shape = mask_logits.shape mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]]) mask_target = mask_target.cast('float32') mask_weight = mask_weight.unsqueeze([1, 2]) loss_mask = F.binary_cross_entropy_with_logits( mask_pred, mask_target, weight=mask_weight, reduction="mean") return loss_mask def forward_train(self, body_feats, rois, rois_num, inputs, targets, bbox_feat): """ body_feats (list[Tensor]): Multi-level backbone features rois (list[Tensor]): Proposals for each batch with shape [N, 4] rois_num (Tensor): The number of proposals for each batch inputs (dict): ground truth info """ tgt_labels, _, tgt_gt_inds = targets rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner( rois, tgt_labels, tgt_gt_inds, inputs) if self.share_bbox_feat: rois_feat = paddle.gather(bbox_feat, mask_index) else: rois_feat = self.roi_extractor(body_feats, rois, rois_num) mask_feat = self.head(rois_feat) mask_logits = self.mask_fcn_logits(mask_feat) loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks, tgt_weights) return {'loss_mask': loss_mask} def forward_test(self, body_feats, rois, rois_num, scale_factor, feat_func=None): """ body_feats (list[Tensor]): Multi-level backbone features rois (Tensor): Prediction from bbox head with shape [N, 6] rois_num (Tensor): The number of prediction for each batch scale_factor (Tensor): The scale factor from origin size to input size """ if not self.export_onnx and rois.shape[0] == 0: mask_out = paddle.full([1, 1, 1], -1) else: bbox = [rois[:, 2:]] labels = rois[:, 0].cast('int32') rois_feat = self.roi_extractor(body_feats, bbox, rois_num) if self.share_bbox_feat: assert feat_func is not None rois_feat = feat_func(rois_feat) mask_feat = self.head(rois_feat) mask_logit = self.mask_fcn_logits(mask_feat) if self.num_classes == 1: mask_out = F.sigmoid(mask_logit)[:, 0, :, :] else: num_masks = mask_logit.shape[0] index = paddle.arange(num_masks).cast('int32') mask_out = mask_logit[index, labels] mask_out_shape = mask_out.shape mask_out = paddle.reshape(mask_out, index.shape + [mask_out_shape[-2]] + [mask_out_shape[-1]]) mask_out = F.sigmoid(mask_out) return mask_out def forward(self, body_feats, rois, rois_num, inputs, targets=None, bbox_feat=None, feat_func=None): if self.training: return self.forward_train(body_feats, rois, rois_num, inputs, targets, bbox_feat) else: im_scale = inputs['scale_factor'] return self.forward_test(body_feats, rois, rois_num, im_scale, feat_func) ================================================ FILE: ppdet/modeling/heads/petr_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py """ import copy import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register import paddle.distributed as dist from ..transformers.petr_transformer import inverse_sigmoid, masked_fill from ..initializer import constant_, normal_ __all__ = ["PETRHead"] from functools import partial def bias_init_with_prob(prior_prob: float) -> float: """initialize conv/fc bias value according to a given probability value.""" bias_init = float(-np.log((1 - prior_prob) / prior_prob)) return bias_init def multi_apply(func, *args, **kwargs): """Apply function to a list of arguments. Note: This function applies the ``func`` to multiple inputs and map the multiple outputs of the ``func`` into different list. Each list contains the same type of outputs corresponding to different inputs. Args: func (Function): A function that will be applied to a list of arguments Returns: tuple(list): A tuple containing multiple list, each list contains \ a kind of returned results by the function """ pfunc = partial(func, **kwargs) if kwargs else func map_results = map(pfunc, *args) res = tuple(map(list, zip(*map_results))) return res def reduce_mean(tensor): """"Obtain the mean of tensor on different GPUs.""" if not (dist.get_world_size() and dist.is_initialized()): return tensor tensor = tensor.clone() dist.all_reduce( tensor.divide( paddle.to_tensor( dist.get_world_size(), dtype='float32')), op=dist.ReduceOp.SUM) return tensor def gaussian_radius(det_size, min_overlap=0.7): """calculate gaussian radius according to object size. """ height, width = det_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = paddle.sqrt(b1**2 - 4 * a1 * c1) r1 = (b1 + sq1) / 2 a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = paddle.sqrt(b2**2 - 4 * a2 * c2) r2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = paddle.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) def gaussian2D(shape, sigma=1): m, n = [(ss - 1.) / 2. 
for ss in shape] y = paddle.arange(-m, m + 1, dtype="float32")[:, None] x = paddle.arange(-n, n + 1, dtype="float32")[None, :] # y, x = np.ogrid[-m:m + 1, -n:n + 1] h = paddle.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(np.float32).eps * h.max()] = 0 return h def draw_umich_gaussian(heatmap, center, radius, k=1): diameter = 2 * radius + 1 gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) gaussian = paddle.to_tensor(gaussian, dtype=heatmap.dtype) x, y = int(center[0]), int(center[1]) radius = int(radius) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: radius + right] # assert masked_gaussian.equal(1).float().sum() == 1 if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: heatmap[y - top:y + bottom, x - left:x + right] = paddle.maximum( masked_heatmap, masked_gaussian * k) return heatmap @register class PETRHead(nn.Layer): """Head of `End-to-End Multi-Person Pose Estimation with Transformers`. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_kpt_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the keypoint regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for building the Encoder and Decoder. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the regression oks loss. Default `OKSLoss`. loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the regression heatmap loss. Default `NegLoss`. as_two_stage (bool) : Whether to generate the proposal from the outputs of encoder. with_kpt_refine (bool): Whether to refine the reference points in the decoder. Defaults to True. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. """ __inject__ = [ "transformer", "positional_encoding", "assigner", "sampler", "loss_cls", "loss_kpt", "loss_oks", "loss_hm", "loss_kpt_rpn", "loss_kpt_refine", "loss_oks_refine" ] def __init__(self, num_classes, in_channels, num_query=100, num_kpt_fcs=2, num_keypoints=17, transformer=None, sync_cls_avg_factor=True, positional_encoding='SinePositionalEncoding', loss_cls='FocalLoss', loss_kpt='L1Loss', loss_oks='OKSLoss', loss_hm='CenterFocalLoss', with_kpt_refine=True, assigner='PoseHungarianAssigner', sampler='PseudoSampler', loss_kpt_rpn='L1Loss', loss_kpt_refine='L1Loss', loss_oks_refine='opera.OKSLoss', test_cfg=dict(max_per_img=100), init_cfg=None, **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. 
super().__init__() self.bg_cls_weight = 0 self.sync_cls_avg_factor = sync_cls_avg_factor self.assigner = assigner self.sampler = sampler self.num_query = num_query self.num_classes = num_classes self.in_channels = in_channels self.num_kpt_fcs = num_kpt_fcs self.test_cfg = test_cfg self.fp16_enabled = False self.as_two_stage = transformer.as_two_stage self.with_kpt_refine = with_kpt_refine self.num_keypoints = num_keypoints self.loss_cls = loss_cls self.loss_kpt = loss_kpt self.loss_kpt_rpn = loss_kpt_rpn self.loss_kpt_refine = loss_kpt_refine self.loss_oks = loss_oks self.loss_oks_refine = loss_oks_refine self.loss_hm = loss_hm if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self.positional_encoding = positional_encoding self.transformer = transformer self.embed_dims = self.transformer.embed_dims # assert 'num_feats' in positional_encoding num_feats = positional_encoding.num_pos_feats assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ f' and {num_feats}.' self._init_layers() self.init_weights() def _init_layers(self): """Initialize classification branch and keypoint branch of head.""" fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels) kpt_branch = [] kpt_branch.append(nn.Linear(self.embed_dims, 512)) kpt_branch.append(nn.ReLU()) for _ in range(self.num_kpt_fcs): kpt_branch.append(nn.Linear(512, 512)) kpt_branch.append(nn.ReLU()) kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints)) kpt_branch = nn.Sequential(*kpt_branch) def _get_clones(module, N): return nn.LayerList([copy.deepcopy(module) for i in range(N)]) # last kpt_branch is used to generate proposal from # encode feature map when as_two_stage is True. num_pred = (self.transformer.decoder.num_layers + 1) if \ self.as_two_stage else self.transformer.decoder.num_layers if self.with_kpt_refine: self.cls_branches = _get_clones(fc_cls, num_pred) self.kpt_branches = _get_clones(kpt_branch, num_pred) else: self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)]) self.kpt_branches = nn.LayerList( [kpt_branch for _ in range(num_pred)]) self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2) refine_kpt_branch = [] for _ in range(self.num_kpt_fcs): refine_kpt_branch.append( nn.Linear(self.embed_dims, self.embed_dims)) refine_kpt_branch.append(nn.ReLU()) refine_kpt_branch.append(nn.Linear(self.embed_dims, 2)) refine_kpt_branch = nn.Sequential(*refine_kpt_branch) if self.with_kpt_refine: num_pred = self.transformer.refine_decoder.num_layers self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred) self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints) def init_weights(self): """Initialize weights of the PETR head.""" self.transformer.init_weights() if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) for m in self.cls_branches: constant_(m.bias, bias_init) for m in self.kpt_branches: constant_(m[-1].bias, 0) # initialization of keypoint refinement branch if self.with_kpt_refine: for m in self.refine_kpt_branches: constant_(m[-1].bias, 0) # initialize bias for heatmap prediction bias_init = bias_init_with_prob(0.1) normal_(self.fc_hm.weight, std=0.01) constant_(self.fc_hm.bias, bias_init) def forward(self, mlvl_feats, img_metas): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor with shape (N, C, H, W). img_metas (list[dict]): List of image information. 
Returns: outputs_classes (Tensor): Outputs from the classification head, shape [nb_dec, bs, num_query, cls_out_channels]. Note cls_out_channels should include background. outputs_kpts (Tensor): Sigmoid outputs from the regression head with normalized coordinate format (x_{i}, y_{i}). Shape [nb_dec, bs, num_query, K*2]. enc_outputs_class (Tensor): The score of each point on encode feature map, has shape (N, h*w, num_classes). Only returned when as_two_stage is True, otherwise `None` is returned. enc_outputs_kpt (Tensor): The proposals generated from the encode feature map, has shape (N, h*w, K*2). Only returned when as_two_stage is True, otherwise `None` is returned. """ batch_size = mlvl_feats[0].shape[0] input_img_h, input_img_w = img_metas[0]['batch_input_shape'] img_masks = paddle.zeros( (batch_size, input_img_h, input_img_w), dtype=mlvl_feats[0].dtype) for img_id in range(batch_size): img_h, img_w, _ = img_metas[img_id]['img_shape'] img_masks[img_id, :img_h, :img_w] = 1 mlvl_masks = [] mlvl_positional_encodings = [] for feat in mlvl_feats: mlvl_masks.append( F.interpolate( img_masks[None], size=feat.shape[-2:]).squeeze(0)) mlvl_positional_encodings.append( self.positional_encoding(mlvl_masks[-1]).transpose( [0, 3, 1, 2])) query_embeds = self.query_embedding.weight hs, init_reference, inter_references, \ enc_outputs_class, enc_outputs_kpt, hm_proto, memory = \ self.transformer( mlvl_feats, mlvl_masks, query_embeds, mlvl_positional_encodings, kpt_branches=self.kpt_branches \ if self.with_kpt_refine else None, # noqa:E501 cls_branches=self.cls_branches \ if self.as_two_stage else None # noqa:E501 ) outputs_classes = [] outputs_kpts = [] for lvl in range(hs.shape[0]): if lvl == 0: reference = init_reference else: reference = inter_references[lvl - 1] reference = inverse_sigmoid(reference) outputs_class = self.cls_branches[lvl](hs[lvl]) tmp_kpt = self.kpt_branches[lvl](hs[lvl]) assert reference.shape[-1] == self.num_keypoints * 2 tmp_kpt += reference outputs_kpt = F.sigmoid(tmp_kpt) outputs_classes.append(outputs_class) outputs_kpts.append(outputs_kpt) outputs_classes = paddle.stack(outputs_classes) outputs_kpts = paddle.stack(outputs_kpts) if hm_proto is not None: # get heatmap prediction (training phase) hm_memory, hm_mask = hm_proto hm_pred = self.fc_hm(hm_memory) hm_proto = (hm_pred.transpose((0, 3, 1, 2)), hm_mask) if self.as_two_stage: return outputs_classes, outputs_kpts, \ enc_outputs_class, F.sigmoid(enc_outputs_kpt), \ hm_proto, memory, mlvl_masks else: raise RuntimeError('only "as_two_stage=True" is supported.') def forward_refine(self, memory, mlvl_masks, refine_targets, losses, img_metas): """Forward function. Args: mlvl_masks (tuple[Tensor]): The key_padding_mask from different levels used for encoder and decoder, each is a 3D-tensor with shape (bs, H, W). losses (dict[str, Tensor]): A dictionary of loss components. img_metas (list[dict]): List of image information. Returns: dict[str, Tensor]: A dictionary of loss components.
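Example:
    A toy illustration of how the flat positive query indices computed
    below map to their image index (num_query assumed to be 300)::

        import paddle
        pos_inds = paddle.to_tensor([5, 310])   # flat indices over bs * num_query
        pos_img_inds = pos_inds // 300          # -> [0, 1]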
""" kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets pos_inds = kpt_weights.sum(-1) > 0 if not pos_inds.any(): pos_kpt_preds = paddle.zeros_like(kpt_preds[:1]) pos_img_inds = paddle.zeros([1], dtype="int64") else: pos_kpt_preds = kpt_preds[pos_inds] pos_img_inds = (pos_inds.nonzero() / self.num_query).squeeze(1).astype("int64") hs, init_reference, inter_references = self.transformer.forward_refine( mlvl_masks, memory, pos_kpt_preds.detach(), pos_img_inds, kpt_branches=self.refine_kpt_branches if self.with_kpt_refine else None, # noqa:E501 ) outputs_kpts = [] for lvl in range(hs.shape[0]): if lvl == 0: reference = init_reference else: reference = inter_references[lvl - 1] reference = inverse_sigmoid(reference) tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl]) assert reference.shape[-1] == 2 tmp_kpt += reference outputs_kpt = F.sigmoid(tmp_kpt) outputs_kpts.append(outputs_kpt) outputs_kpts = paddle.stack(outputs_kpts) if not self.training: return outputs_kpts num_valid_kpt = paddle.clip( reduce_mean(kpt_weights.sum()), min=1).item() num_total_pos = paddle.to_tensor( [outputs_kpts.shape[1]], dtype=kpt_weights.dtype) num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() if not pos_inds.any(): for i, kpt_refine_preds in enumerate(outputs_kpts): loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 losses[f'd{i}.loss_kpt_refine'] = loss_kpt losses[f'd{i}.loss_oks_refine'] = loss_oks continue return losses batch_size = mlvl_masks[0].shape[0] factors = [] for img_id in range(batch_size): img_h, img_w, _ = img_metas[img_id]['img_shape'] factor = paddle.to_tensor( [img_w, img_h, img_w, img_h], dtype="float32").squeeze(-1).unsqueeze(0).tile( (self.num_query, 1)) factors.append(factor) factors = paddle.concat(factors, 0) factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2)) pos_kpt_weights = kpt_weights[pos_inds] pos_kpt_targets = kpt_targets[pos_inds] pos_kpt_targets_scaled = pos_kpt_targets * factors pos_areas = area_targets[pos_inds] pos_valid = kpt_weights[pos_inds][:, 0::2] for i, kpt_refine_preds in enumerate(outputs_kpts): if not pos_inds.any(): print("refine kpt and oks skip") loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 losses[f'd{i}.loss_kpt_refine'] = loss_kpt losses[f'd{i}.loss_oks_refine'] = loss_oks continue # kpt L1 Loss pos_refine_preds = kpt_refine_preds.reshape( (kpt_refine_preds.shape[0], -1)) loss_kpt = self.loss_kpt_refine( pos_refine_preds, pos_kpt_targets, pos_kpt_weights, avg_factor=num_valid_kpt) losses[f'd{i}.loss_kpt_refine'] = loss_kpt # kpt oks loss pos_refine_preds_scaled = pos_refine_preds * factors assert (pos_areas > 0).all() loss_oks = self.loss_oks_refine( pos_refine_preds_scaled, pos_kpt_targets_scaled, pos_valid, pos_areas, avg_factor=num_total_pos) losses[f'd{i}.loss_oks_refine'] = loss_oks return losses # over-write because img_metas are needed as inputs for bbox_head. def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_keypoints=None, gt_areas=None, gt_bboxes_ignore=None, proposal_cfg=None, **kwargs): """Forward function for training mode. Args: x (list[Tensor]): Features from backbone. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes (list[Tensor]): Ground truth bboxes of the image, shape (num_gts, 4). gt_labels (list[Tensor]): Ground truth labels of each box, shape (num_gts,). gt_keypoints (list[Tensor]): Ground truth keypoints of the image, shape (num_gts, K*3). gt_areas (list[Tensor]): Ground truth mask areas of each box, shape (num_gts,). 
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). proposal_cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert proposal_cfg is None, '"proposal_cfg" must be None' outs = self(x, img_metas) memory, mlvl_masks = outs[-2:] outs = outs[:-2] if gt_labels is None: loss_inputs = outs + (gt_bboxes, gt_keypoints, gt_areas, img_metas) else: loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypoints, gt_areas, img_metas) losses_and_targets = self.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) # losses = losses_and_targets losses, refine_targets = losses_and_targets # get pose refinement loss losses = self.forward_refine(memory, mlvl_masks, refine_targets, losses, img_metas) return losses def loss(self, all_cls_scores, all_kpt_preds, enc_cls_scores, enc_kpt_preds, enc_hm_proto, gt_bboxes_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas, gt_bboxes_ignore=None): """Loss function. Args: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_kpt_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (x_{i}, y_{i}) and shape [nb_dec, bs, num_query, K*2]. enc_cls_scores (Tensor): Classification scores of points on encode feature map, has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_kpt_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, K*2). Only be passed when as_two_stage is True, otherwise is None. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' num_dec_layers = len(all_cls_scores) all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] all_gt_keypoints_list = [ gt_keypoints_list for _ in range(num_dec_layers) ] all_gt_areas_list = [gt_areas_list for _ in range(num_dec_layers)] img_metas_list = [img_metas for _ in range(num_dec_layers)] losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list, \ area_targets_list, kpt_weights_list = multi_apply( self.loss_single, all_cls_scores, all_kpt_preds, all_gt_labels_list, all_gt_keypoints_list, all_gt_areas_list, img_metas_list) loss_dict = dict() # loss of proposal generated from encode feature map. 
if enc_cls_scores is not None: binary_labels_list = [ paddle.zeros_like(gt_labels_list[i]) for i in range(len(img_metas)) ] enc_loss_cls, enc_losses_kpt = \ self.loss_single_rpn( enc_cls_scores, enc_kpt_preds, binary_labels_list, gt_keypoints_list, gt_areas_list, img_metas) loss_dict['enc_loss_cls'] = enc_loss_cls loss_dict['enc_loss_kpt'] = enc_losses_kpt # loss from the last decoder layer loss_dict['loss_cls'] = losses_cls[-1] loss_dict['loss_kpt'] = losses_kpt[-1] loss_dict['loss_oks'] = losses_oks[-1] # loss from other decoder layers num_dec_layer = 0 for loss_cls_i, loss_kpt_i, loss_oks_i in zip( losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1]): loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i loss_dict[f'd{num_dec_layer}.loss_kpt'] = loss_kpt_i loss_dict[f'd{num_dec_layer}.loss_oks'] = loss_oks_i num_dec_layer += 1 # losses of heatmap generated from P3 feature map hm_pred, hm_mask = enc_hm_proto loss_hm = self.loss_heatmap(hm_pred, hm_mask, gt_keypoints_list, gt_labels_list, gt_bboxes_list) loss_dict['loss_hm'] = loss_hm return loss_dict, (kpt_preds_list[-1], kpt_targets_list[-1], area_targets_list[-1], kpt_weights_list[-1]) def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels, gt_bboxes): assert hm_pred.shape[-2:] == hm_mask.shape[-2:] num_img, _, h, w = hm_pred.shape # placeholder of heatmap target (Gaussian distribution) hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype) for i, (gt_label, gt_bbox, gt_keypoint ) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)): if gt_label.shape[0] == 0: continue gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1, 3)).clone() gt_keypoint[..., :2] /= 8 assert gt_keypoint[..., 0].max() <= w + 0.5 # new coordinate system assert gt_keypoint[..., 1].max() <= h + 0.5 # new coordinate system gt_bbox /= 8 gt_w = gt_bbox[:, 2] - gt_bbox[:, 0] gt_h = gt_bbox[:, 3] - gt_bbox[:, 1] for j in range(gt_label.shape[0]): # get heatmap radius kp_radius = paddle.clip( paddle.floor( gaussian_radius( (gt_h[j], gt_w[j]), min_overlap=0.9)), min=0, max=3) for k in range(self.num_keypoints): if gt_keypoint[j, k, 2] > 0: gt_kp = gt_keypoint[j, k, :2] gt_kp_int = paddle.floor(gt_kp) hm_target[i, k] = draw_umich_gaussian( hm_target[i, k], gt_kp_int, kp_radius) # compute heatmap loss hm_pred = paddle.clip( F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4) # refer to CenterNet loss_hm = self.loss_hm( hm_pred, hm_target.detach(), mask=~hm_mask.astype("bool").unsqueeze(1)) return loss_hm def loss_single(self, cls_scores, kpt_preds, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas): """Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. kpt_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (x_{i}, y_{i}) and shape [bs, num_query, K*2]. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. 
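Example:
    A shape sketch of the rescale factors built below, which map normalized
    (x, y) predictions back to pixel coordinates (all sizes assumed)::

        import paddle
        K, img_w, img_h = 17, 800, 600
        kpt = paddle.rand([100, K * 2])      # normalized (x, y) pairs
        factor = paddle.to_tensor([img_w, img_h], dtype='float32').tile([K])
        kpt_pix = kpt * factor.unsqueeze(0)  # pixel coordinates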
""" num_imgs = cls_scores.shape[0] cls_scores_list = [cls_scores[i] for i in range(num_imgs)] kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets labels = paddle.concat(labels_list, 0) label_weights = paddle.concat(label_weights_list, 0) kpt_targets = paddle.concat(kpt_targets_list, 0) kpt_weights = paddle.concat(kpt_weights_list, 0) area_targets = paddle.concat(area_targets_list, 0) # classification loss cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( paddle.to_tensor( [cls_avg_factor], dtype=cls_scores.dtype)) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt keypoints accross all gpus, for # normalization purposes num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype) num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() # construct factors used for rescale keypoints factors = [] for img_meta, kpt_pred in zip(img_metas, kpt_preds): img_h, img_w, _ = img_meta['img_shape'] factor = paddle.to_tensor( [img_w, img_h, img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile( (kpt_pred.shape[0], 1)) factors.append(factor) factors = paddle.concat(factors, 0) # keypoint regression loss kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) num_valid_kpt = paddle.clip( reduce_mean(kpt_weights.sum()), min=1).item() # assert num_valid_kpt == (kpt_targets>0).sum().item() loss_kpt = self.loss_kpt( kpt_preds, kpt_targets.detach(), kpt_weights.detach(), avg_factor=num_valid_kpt) # keypoint oks loss pos_inds = kpt_weights.sum(-1) > 0 if not pos_inds.any(): loss_oks = kpt_preds.sum() * 0 else: factors = factors[pos_inds][:, :2].tile(( (1, kpt_preds.shape[-1] // 2))) pos_kpt_preds = kpt_preds[pos_inds] * factors pos_kpt_targets = kpt_targets[pos_inds] * factors pos_areas = area_targets[pos_inds] pos_valid = kpt_weights[pos_inds][..., 0::2] assert (pos_areas > 0).all() loss_oks = self.loss_oks( pos_kpt_preds, pos_kpt_targets, pos_valid, pos_areas, avg_factor=num_total_pos) return loss_cls, loss_kpt, loss_oks, kpt_preds, kpt_targets, \ area_targets, kpt_weights def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas): """Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: cls_scores_list (list[Tensor]): Box score logits from a single decoder layer for each image with shape [num_query, cls_out_channels]. kpt_preds_list (list[Tensor]): Sigmoid outputs from a single decoder layer for each image, with normalized coordinate (x_{i}, y_{i}) and shape [num_query, K*2]. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3). gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. 
Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all images. - kpt_targets_list (list[Tensor]): Keypoint targets for all images. - kpt_weights_list (list[Tensor]): Keypoint weights for all images. - area_targets_list (list[Tensor]): area targets for all images. - num_total_pos (int): Number of positive samples in all images. - num_total_neg (int): Number of negative samples in all images. """ (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, pos_inds_list, neg_inds_list) = multi_apply( self._get_target_single, cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, num_total_pos, num_total_neg) def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints, gt_areas, img_meta): """Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: cls_score (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. kpt_pred (Tensor): Sigmoid outputs from a single decoder layer for one image, with normalized coordinate (x_{i}, y_{i}) and shape [num_query, K*2]. gt_labels (Tensor): Ground truth class indices for one image with shape (num_gts, ). gt_keypoints (Tensor): Ground truth keypoints for one image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas (Tensor): Ground truth mask areas for one image with shape (num_gts, ). img_meta (dict): Meta information for one image. Returns: tuple[Tensor]: a tuple containing the following for one image. - labels (Tensor): Labels of each image. - label_weights (Tensor): Label weights of each image. - kpt_targets (Tensor): Keypoint targets of each image. - kpt_weights (Tensor): Keypoint weights of each image. - area_targets (Tensor): Area targets of each image. - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. 
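Example:
    A toy sketch of the label targets assembled below: every query starts
    as background (= num_classes) and only matched queries are overwritten
    (indices assumed)::

        import paddle
        num_query, num_classes = 4, 1
        labels = paddle.full([num_query], num_classes, dtype='int64')
        labels[paddle.to_tensor([2])] = 0   # matched query gets its gt class
        # labels -> [1, 1, 0, 1]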
""" num_bboxes = kpt_pred.shape[0] # assigner and sampler assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels, gt_keypoints, gt_areas, img_meta) sampling_result = self.sampler.sample(assign_result, kpt_pred, gt_keypoints) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds # label targets labels = paddle.full((num_bboxes, ), self.num_classes, dtype="int64") label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype) kpt_targets = paddle.zeros_like(kpt_pred) kpt_weights = paddle.zeros_like(kpt_pred) area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype) if pos_inds.size == 0: return (labels, label_weights, kpt_targets, kpt_weights, area_targets, pos_inds, neg_inds) labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][ ..., 0].astype("int64") img_h, img_w, _ = img_meta['img_shape'] # keypoint targets pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds] pos_gt_kpts = pos_gt_kpts.reshape( (len(sampling_result.pos_assigned_gt_inds), -1, 3)) valid_idx = pos_gt_kpts[:, :, 2] > 0 pos_kpt_weights = kpt_weights[pos_inds].reshape( (pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2)) # pos_kpt_weights[valid_idx][...] = 1.0 pos_kpt_weights = masked_fill(pos_kpt_weights, valid_idx.unsqueeze(-1), 1.0) kpt_weights[pos_inds] = pos_kpt_weights.reshape( (pos_kpt_weights.shape[0], kpt_pred.shape[-1])) factor = paddle.to_tensor( [img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0) pos_gt_kpts_normalized = pos_gt_kpts[..., :2] pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \ factor[:, 0:1] pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \ factor[:, 1:2] kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape( (pos_gt_kpts.shape[0], kpt_pred.shape[-1])) pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0] area_targets[pos_inds] = pos_gt_areas return (labels, label_weights, kpt_targets, kpt_weights, area_targets, pos_inds, neg_inds) def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas): """Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. kpt_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (x_{i}, y_{i}) and shape [bs, num_query, K*2]. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. 
""" num_imgs = cls_scores.shape[0] cls_scores_list = [cls_scores[i] for i in range(num_imgs)] kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets labels = paddle.concat(labels_list, 0) label_weights = paddle.concat(label_weights_list, 0) kpt_targets = paddle.concat(kpt_targets_list, 0) kpt_weights = paddle.concat(kpt_weights_list, 0) # classification loss cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( paddle.to_tensor( [cls_avg_factor], dtype=cls_scores.dtype)) cls_avg_factor = max(cls_avg_factor, 1) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt keypoints accross all gpus, for # normalization purposes # num_total_pos = loss_cls.to_tensor([num_total_pos]) # num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() # keypoint regression loss kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) num_valid_kpt = paddle.clip( reduce_mean(kpt_weights.sum()), min=1).item() # assert num_valid_kpt == (kpt_targets>0).sum().item() loss_kpt = self.loss_kpt_rpn( kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt) return loss_cls, loss_kpt def get_bboxes(self, all_cls_scores, all_kpt_preds, enc_cls_scores, enc_kpt_preds, hm_proto, memory, mlvl_masks, img_metas, rescale=False): """Transform network outputs for a batch into bbox predictions. Args: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_kpt_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (x_{i}, y_{i}) and shape [nb_dec, bs, num_query, K*2]. enc_cls_scores (Tensor): Classification scores of points on encode feature map, has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_kpt_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, K*2). Only be passed when as_two_stage is True, otherwise is None. img_metas (list[dict]): Meta information of each image. rescale (bool, optional): If True, return boxes in original image space. Defalut False. Returns: list[list[Tensor, Tensor]]: Each item in result_list is 3-tuple. The first item is an (n, 5) tensor, where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between 0 and 1. The second item is a (n,) tensor where each item is the predicted class label of the corresponding box. The third item is an (n, K, 3) tensor with [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. 
""" cls_scores = all_cls_scores[-1] kpt_preds = all_kpt_preds[-1] result_list = [] for img_id in range(len(img_metas)): cls_score = cls_scores[img_id] kpt_pred = kpt_preds[img_id] img_shape = img_metas[img_id]['img_shape'] scale_factor = img_metas[img_id]['scale_factor'] # TODO: only support single image test # memory_i = memory[:, img_id, :] # mlvl_mask = mlvl_masks[img_id] proposals = self._get_bboxes_single(cls_score, kpt_pred, img_shape, scale_factor, memory, mlvl_masks, rescale) result_list.append(proposals) return result_list def _get_bboxes_single(self, cls_score, kpt_pred, img_shape, scale_factor, memory, mlvl_masks, rescale=False): """Transform outputs from the last decoder layer into bbox predictions for each image. Args: cls_score (Tensor): Box score logits from the last decoder layer for each image. Shape [num_query, cls_out_channels]. kpt_pred (Tensor): Sigmoid outputs from the last decoder layer for each image, with coordinate format (x_{i}, y_{i}) and shape [num_query, K*2]. img_shape (tuple[int]): Shape of input image, (height, width, 3). scale_factor (ndarray, optional): Scale factor of the image arange as (w_scale, h_scale, w_scale, h_scale). rescale (bool, optional): If True, return boxes in original image space. Default False. Returns: tuple[Tensor]: Results of detected bboxes and labels. - det_bboxes: Predicted bboxes with shape [num_query, 5], where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column are scores between 0 and 1. - det_labels: Predicted labels of the corresponding box with shape [num_query]. - det_kpts: Predicted keypoints with shape [num_query, K, 3]. """ assert len(cls_score) == len(kpt_pred) max_per_img = self.test_cfg.get('max_per_img', self.num_query) # exclude background if self.loss_cls.use_sigmoid: cls_score = F.sigmoid(cls_score) scores, indexs = cls_score.reshape([-1]).topk(max_per_img) det_labels = indexs % self.num_classes bbox_index = indexs // self.num_classes kpt_pred = kpt_pred[bbox_index] else: scores, det_labels = F.softmax(cls_score, axis=-1)[..., :-1].max(-1) scores, bbox_index = scores.topk(max_per_img) kpt_pred = kpt_pred[bbox_index] det_labels = det_labels[bbox_index] # ----- results after pose decoder ----- # det_kpts = kpt_pred.reshape((kpt_pred.shape[0], -1, 2)) # ----- results after joint decoder (default) ----- # import time # start = time.time() refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred)) refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets, None, None) # end = time.time() # print(f'refine time: {end - start:.6f}') det_kpts = refine_outputs[-1] det_kpts[..., 0] = det_kpts[..., 0] * img_shape[1] det_kpts[..., 1] = det_kpts[..., 1] * img_shape[0] det_kpts[..., 0].clip_(min=0, max=img_shape[1]) det_kpts[..., 1].clip_(min=0, max=img_shape[0]) if rescale: det_kpts /= paddle.to_tensor( scale_factor[:2], dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0) # use circumscribed rectangle box of keypoints as det bboxes x1 = det_kpts[..., 0].min(axis=1, keepdim=True) y1 = det_kpts[..., 1].min(axis=1, keepdim=True) x2 = det_kpts[..., 0].max(axis=1, keepdim=True) y2 = det_kpts[..., 1].max(axis=1, keepdim=True) det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1) det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1) det_kpts = paddle.concat( (det_kpts, paddle.ones( det_kpts[..., :1].shape, dtype=det_kpts.dtype)), axis=2) return det_bboxes, det_labels, det_kpts def simple_test(self, feats, img_metas, rescale=False): """Test det bboxes without 
test-time augmentation. Args: feats (tuple[paddle.Tensor]): Multi-level features from the upstream network, each is a 4D-tensor. img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. Returns: list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is a 3-tuple. The first item is ``bboxes`` with shape (n, 5), where 5 represent (tl_x, tl_y, br_x, br_y, score). The second item is ``labels`` with shape (n,). The third item is ``kpts`` with shape (n, K, 3), in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. """ # forward of this head requires img_metas outs = self.forward(feats, img_metas) results_list = self.get_bboxes(*outs, img_metas, rescale=rescale) return results_list def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) ================================================ FILE: ppdet/modeling/heads/pico_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.modeling.ops import get_static_shape from ..initializer import normal_ from ..assigners.utils import generate_anchors_for_grid_cell from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance from ppdet.core.workspace import register from ppdet.modeling.layers import ConvNormLayer from .simota_head import OTAVFLHead from .gfl_head import Integral, GFLHead from ppdet.modeling.necks.csp_pan import DPModule eps = 1e-9 __all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat'] # global average pooling via mean, used on NPU devices where avg pooling kernels are slow def npu_avg_pool2d(feat, w, h): batch_size, channels, _, _ = feat.shape feat_flat = paddle.reshape(feat, [batch_size, channels, -1]) feat_mean = paddle.mean(feat_flat, axis=2) feat_mean = paddle.reshape( feat_mean, [batch_size, channels, w, h]) return feat_mean class PicoSE(nn.Layer): def __init__(self, feat_channels): super(PicoSE, self).__init__() self.fc = nn.Conv2D(feat_channels, feat_channels, 1) self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1) self._init_weights() def _init_weights(self): normal_(self.fc.weight, std=0.001) def forward(self, feat, avg_feat): weight = F.sigmoid(self.fc(avg_feat)) out = self.conv(feat * weight) return out @register class PicoFeat(nn.Layer): """ PicoFeat of PicoDet Args: feat_in (int): The channel number of input Tensor. feat_out (int): The channel number of output Tensor. num_fpn_stride (int): The number of FPN stride levels (feature maps). num_convs (int): The number of depthwise-separable conv blocks per branch. norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. share_cls_reg (bool): Whether to share the cls and reg output. act (str): The activation function used in each layer. use_se (bool): Whether to use se module.
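    Example (editor's sketch, not part of the original docstring; the
    feat_in=96 and 80x80 feature size are assumed, matching PicoDet configs
    rather than the constructor defaults):

        >>> import paddle
        >>> pico_feat = PicoFeat(feat_in=96, feat_out=96, num_fpn_stride=3)
        >>> fpn_feat = paddle.randn([1, 96, 80, 80])   # one FPN level
        >>> cls_feat, reg_feat = pico_feat(fpn_feat, stage_idx=0)
        >>> cls_feat.shape  # [1, 96, 80, 80]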
""" def __init__(self, feat_in=256, feat_out=96, num_fpn_stride=3, num_convs=2, norm_type='bn', share_cls_reg=False, act='hard_swish', use_se=False): super(PicoFeat, self).__init__() self.num_convs = num_convs self.norm_type = norm_type self.share_cls_reg = share_cls_reg self.act = act self.use_se = use_se self.cls_convs = [] self.reg_convs = [] if paddle.device.get_device().startswith("npu"): self.device = "npu" else: self.device = None if use_se: assert share_cls_reg == True, \ 'In the case of using se, share_cls_reg must be set to True' self.se = nn.LayerList() for stage_idx in range(num_fpn_stride): cls_subnet_convs = [] reg_subnet_convs = [] for i in range(self.num_convs): in_c = feat_in if i == 0 else feat_out cls_conv_dw = self.add_sublayer( 'cls_conv_dw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=5, stride=1, groups=feat_out, norm_type=norm_type, bias_on=False, lr_scale=2.)) cls_subnet_convs.append(cls_conv_dw) cls_conv_pw = self.add_sublayer( 'cls_conv_pw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=1, stride=1, norm_type=norm_type, bias_on=False, lr_scale=2.)) cls_subnet_convs.append(cls_conv_pw) if not self.share_cls_reg: reg_conv_dw = self.add_sublayer( 'reg_conv_dw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=5, stride=1, groups=feat_out, norm_type=norm_type, bias_on=False, lr_scale=2.)) reg_subnet_convs.append(reg_conv_dw) reg_conv_pw = self.add_sublayer( 'reg_conv_pw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=1, stride=1, norm_type=norm_type, bias_on=False, lr_scale=2.)) reg_subnet_convs.append(reg_conv_pw) self.cls_convs.append(cls_subnet_convs) self.reg_convs.append(reg_subnet_convs) if use_se: self.se.append(PicoSE(feat_out)) def act_func(self, x): if self.act == "leaky_relu": x = F.leaky_relu(x) elif self.act == "hard_swish": x = F.hardswish(x) elif self.act == "relu6": x = F.relu6(x) return x def forward(self, fpn_feat, stage_idx): assert stage_idx < len(self.cls_convs) cls_feat = fpn_feat reg_feat = fpn_feat for i in range(len(self.cls_convs[stage_idx])): cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat)) reg_feat = cls_feat if not self.share_cls_reg: reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat)) if self.use_se: if self.device == "npu": avg_feat = npu_avg_pool2d(cls_feat, 1, 1) else: avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1)) se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat)) return cls_feat, se_feat return cls_feat, reg_feat @register class PicoHead(OTAVFLHead): """ PicoHead Args: conv_feat (object): Instance of 'PicoFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of VariFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. assigner (object): Instance of label assigner. reg_max: Max value of integral set :math: `{0, ..., reg_max}` n QFL setting. Default: 7. 
""" __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] __shared__ = ['num_classes', 'eval_size'] def __init__(self, conv_feat='PicoFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32], prior_prob=0.01, loss_class='VariFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', assigner='SimOTAAssigner', reg_max=16, feat_in_chan=96, nms=None, nms_pre=1000, cell_offset=0, eval_size=None): super(PicoHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, assigner=assigner, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.conv_feat = conv_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_vfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.assigner = assigner self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.eval_size = eval_size self.device = paddle.device.get_device() self.use_sigmoid = self.loss_vfl.use_sigmoid if self.use_sigmoid: self.cls_out_channels = self.num_classes else: self.cls_out_channels = self.num_classes + 1 bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) # Clear the super class initialization self.gfl_head_cls = None self.gfl_head_reg = None self.scales_regs = None self.head_cls_list = [] self.head_reg_list = [] for i in range(len(fpn_stride)): head_cls = self.add_sublayer( "head_cls" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=self.cls_out_channels + 4 * (self.reg_max + 1) if self.conv_feat.share_cls_reg else self.cls_out_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) self.head_cls_list.append(head_cls) if not self.conv_feat.share_cls_reg: head_reg = self.add_sublayer( "head_reg" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=4 * (self.reg_max + 1), kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.head_reg_list.append(head_reg) # initialize the anchor points if self.eval_size: self.anchor_points, self.stride_tensor = self._generate_anchors() def forward(self, fpn_feats, export_post_process=True): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" if self.training: return self.forward_train(fpn_feats) else: return self.forward_eval( fpn_feats, export_post_process=export_post_process) def forward_train(self, fpn_feats): cls_logits_list, bboxes_reg_list = [], [] for i, fpn_feat in enumerate(fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) if self.conv_feat.share_cls_reg: cls_logits = self.head_cls_list[i](conv_cls_feat) cls_score, bbox_pred = paddle.split( cls_logits, [self.cls_out_channels, 4 * (self.reg_max + 1)], axis=1) else: cls_score = self.head_cls_list[i](conv_cls_feat) bbox_pred = self.head_reg_list[i](conv_reg_feat) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score cls_logits_list.append(cls_score) bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) def forward_eval(self, fpn_feats, 
export_post_process=True): if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(fpn_feats) cls_logits_list, bboxes_reg_list = [], [] for i, fpn_feat in enumerate(fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) if self.conv_feat.share_cls_reg: cls_logits = self.head_cls_list[i](conv_cls_feat) cls_score, bbox_pred = paddle.split( cls_logits, [self.cls_out_channels, 4 * (self.reg_max + 1)], axis=1) else: cls_score = self.head_cls_list[i](conv_cls_feat) bbox_pred = self.head_reg_list[i](conv_reg_feat) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score if not export_post_process: # Now only supports batch size = 1 in deploy # TODO(ygh): support batch size > 1 cls_score_out = F.sigmoid(cls_score).reshape( [1, self.cls_out_channels, -1]).transpose([0, 2, 1]) bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose([0, 2, 1]) else: _, _, h, w = fpn_feat.shape l = h * w cls_score_out = F.sigmoid( cls_score.reshape([-1, self.cls_out_channels, l])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) bbox_pred = self.distribution_project(bbox_pred) bbox_pred = bbox_pred.reshape([-1, l, 4]) cls_logits_list.append(cls_score_out) bboxes_reg_list.append(bbox_pred) if export_post_process: cls_logits_list = paddle.concat(cls_logits_list, axis=-1) bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1) bboxes_reg_list = batch_distance2bbox(anchor_points, bboxes_reg_list) bboxes_reg_list *= stride_tensor return (cls_logits_list, bboxes_reg_list) def _generate_anchors(self, feats=None): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_stride): if feats is not None: _, _, h, w = feats[i].shape else: h = math.ceil(self.eval_size[0] / stride) w = math.ceil(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.cell_offset shift_y = paddle.arange(end=h) + self.cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append( paddle.full( [h * w, 1], stride, dtype='float32')) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def post_process(self, head_outs, scale_factor, export_nms=True, nms_cpu=False): pred_scores, pred_bboxes = head_outs if not export_nms: return pred_bboxes, pred_scores else: # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) # scale bbox to origin image size. pred_bboxes /= scale_factor if nms_cpu: paddle.set_device("cpu") bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) paddle.set_device(self.device) else: bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num @register class PicoHeadV2(GFLHead): """ PicoHeadV2 Args: conv_feat (object): Instance of 'PicoFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of VariFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. 
assigner (object): Instance of label assigner. reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16. """ __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'static_assigner', 'assigner', 'nms' ] __shared__ = ['num_classes', 'eval_size'] def __init__(self, conv_feat='PicoFeatV2', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32], prior_prob=0.01, use_align_head=True, loss_class='VariFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', static_assigner_epoch=60, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', reg_max=16, feat_in_chan=96, nms=None, nms_pre=1000, cell_offset=0, act='hard_swish', grid_cell_scale=5.0, eval_size=None): super(PicoHeadV2, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset, ) self.conv_feat = conv_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_vfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.act = act self.grid_cell_scale = grid_cell_scale self.use_align_head = use_align_head self.cls_out_channels = self.num_classes self.eval_size = eval_size bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) # Clear the super class initialization self.gfl_head_cls = None self.gfl_head_reg = None self.scales_regs = None self.head_cls_list = nn.LayerList() self.head_reg_list = nn.LayerList() self.cls_align = nn.LayerList() for i in range(len(fpn_stride)): head_cls = self.add_sublayer( "head_cls" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=self.cls_out_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) self.head_cls_list.append(head_cls) head_reg = self.add_sublayer( "head_reg" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=4 * (self.reg_max + 1), kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.head_reg_list.append(head_reg) if self.use_align_head: self.cls_align.append( DPModule( self.feat_in_chan, 1, 5, act=self.act, use_act_in_out=False)) # initialize the anchor points if self.eval_size: self.anchor_points, self.stride_tensor = self._generate_anchors() def forward(self, fpn_feats, export_post_process=True): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" if self.training: return self.forward_train(fpn_feats) else: return self.forward_eval( fpn_feats, export_post_process=export_post_process) def forward_train(self, fpn_feats): cls_score_list, reg_list, box_list = [], [], [] for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): b, _, h, w = get_static_shape(fpn_feat) # task decomposition conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) cls_logit = self.head_cls_list[i](se_feat) reg_pred = self.head_reg_list[i](se_feat) # cls prediction and alignment if
self.use_align_head: cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() else: cls_score = F.sigmoid(cls_logit) cls_score_out = cls_score.transpose([0, 2, 3, 1]) bbox_pred = reg_pred.transpose([0, 2, 3, 1]) b, cell_h, cell_w, _ = cls_score_out.shape y, x = self.get_single_level_center_point( [cell_h, cell_w], stride, cell_offset=self.cell_offset) center_points = paddle.stack([x, y], axis=-1) cls_score_out = cls_score_out.reshape( [b, -1, self.cls_out_channels]) bbox_pred = self.distribution_project(bbox_pred) * stride bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) bbox_pred = batch_distance2bbox( center_points, bbox_pred, max_shapes=None) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1])) box_list.append(bbox_pred / stride) cls_score_list = paddle.concat(cls_score_list, axis=1) box_list = paddle.concat(box_list, axis=1) reg_list = paddle.concat(reg_list, axis=1) return cls_score_list, reg_list, box_list, fpn_feats def forward_eval(self, fpn_feats, export_post_process=True): if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(fpn_feats) cls_score_list, box_list = [], [] for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): _, _, h, w = fpn_feat.shape # task decomposition conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) cls_logit = self.head_cls_list[i](se_feat) reg_pred = self.head_reg_list[i](se_feat) # cls prediction and alignment if self.use_align_head: cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() else: cls_score = F.sigmoid(cls_logit) if not export_post_process: # Now only supports batch size = 1 in deploy cls_score_list.append( cls_score.reshape([1, self.cls_out_channels, -1]).transpose( [0, 2, 1])) box_list.append( reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose( [0, 2, 1])) else: l = h * w cls_score_out = cls_score.reshape( [-1, self.cls_out_channels, l]) bbox_pred = reg_pred.transpose([0, 2, 3, 1]) bbox_pred = self.distribution_project(bbox_pred) bbox_pred = bbox_pred.reshape([-1, l, 4]) cls_score_list.append(cls_score_out) box_list.append(bbox_pred) if export_post_process: cls_score_list = paddle.concat(cls_score_list, axis=-1) box_list = paddle.concat(box_list, axis=1) box_list = batch_distance2bbox(anchor_points, box_list) box_list *= stride_tensor return cls_score_list, box_list def get_loss(self, head_outs, gt_meta): pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None num_imgs = gt_meta['im_id'].shape[0] pad_gt_mask = gt_meta['pad_gt_mask'] anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell( fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset) centers = bbox_center(anchors) # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes, gt_scores=gt_scores, pred_bboxes=pred_bboxes.detach() * stride_tensor_list) else: assigned_labels, assigned_bboxes, assigned_scores = self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor_list, centers, num_anchors_list, gt_labels, gt_bboxes, 
pad_gt_mask, bg_index=self.num_classes, gt_scores=gt_scores) assigned_bboxes /= stride_tensor_list centers_shape = centers.shape flatten_centers = centers.expand( [num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2]) flatten_strides = stride_tensor_list.expand( [num_imgs, centers_shape[0], 1]).reshape([-1, 1]) flatten_cls_preds = pred_scores.reshape([-1, self.num_classes]) flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)]) flatten_bboxes = pred_bboxes.reshape([-1, 4]) flatten_bbox_targets = assigned_bboxes.reshape([-1, 4]) flatten_labels = assigned_labels.reshape([-1]) flatten_assigned_scores = assigned_scores.reshape( [-1, self.num_classes]) pos_inds = paddle.nonzero( paddle.logical_and((flatten_labels >= 0), (flatten_labels < self.num_classes)), as_tuple=False).squeeze(1) num_total_pos = len(pos_inds) if num_total_pos > 0: pos_bbox_targets = paddle.gather( flatten_bbox_targets, pos_inds, axis=0) pos_decode_bbox_pred = paddle.gather( flatten_bboxes, pos_inds, axis=0) pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0) pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0) pos_centers = paddle.gather( flatten_centers, pos_inds, axis=0) / pos_strides weight_targets = flatten_assigned_scores.detach() weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pred_corners = pos_reg.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_centers, pos_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = paddle.zeros([]) loss_dfl = paddle.zeros([]) avg_factor = flatten_assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) loss_vfl = self.loss_vfl( flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor) loss_bbox = loss_bbox / avg_factor loss_dfl = loss_dfl / avg_factor loss_states = dict( loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states def _generate_anchors(self, feats=None): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_stride): if feats is not None: _, _, h, w = feats[i].shape else: h = math.ceil(self.eval_size[0] / stride) w = math.ceil(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.cell_offset shift_y = paddle.arange(end=h) + self.cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append( paddle.full( [h * w, 1], stride, dtype='float32')) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def post_process(self, head_outs, scale_factor, export_nms=True, nms_cpu=False): pred_scores, pred_bboxes = head_outs if not export_nms: return pred_bboxes, pred_scores else: # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) # scale bbox to origin image size. 
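        # Editor's note (illustrative, not from the source): `scale_factor`
        # arrives per image as [h_scale, w_scale]; the split/concat above
        # rearranges it to [w_scale, h_scale, w_scale, h_scale] so the
        # division below maps (x1, y1, x2, y2) back component-wise. E.g.
        # with h_scale=0.5 and w_scale=0.25, a predicted box
        # [100., 50., 200., 150.] becomes [400., 100., 800., 300.] in the
        # original image.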
pred_bboxes /= scale_factor bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/heads/ppyoloe_contrast_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..initializer import bias_init_with_prob, constant_ from ..assigners.utils import generate_anchors_for_grid_cell from ppdet.modeling.heads.ppyoloe_head import PPYOLOEHead __all__ = ['PPYOLOEContrastHead'] @register class PPYOLOEContrastHead(PPYOLOEHead): __shared__ = [ 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process', 'use_shared_conv', 'for_distill' ] __inject__ = ['static_assigner', 'assigner', 'nms', 'contrast_loss'] def __init__(self, in_channels=[1024, 512, 256], num_classes=80, act='swish', fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, reg_range=None, static_assigner_epoch=4, use_varifocal_loss=True, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', contrast_loss='SupContrast', nms='MultiClassNMS', eval_size=None, loss_weight={ 'class': 1.0, 'iou': 2.5, 'dfl': 0.5, }, trt=False, attn_conv='convbn', exclude_nms=False, exclude_post_process=False, use_shared_conv=True, for_distill=False): super().__init__(in_channels, num_classes, act, fpn_strides, grid_cell_scale, grid_cell_offset, reg_max, reg_range, static_assigner_epoch, use_varifocal_loss, static_assigner, assigner, nms, eval_size, loss_weight, trt, attn_conv, exclude_nms, exclude_post_process, use_shared_conv, for_distill) assert len(in_channels) > 0, "len(in_channels) should > 0" self.contrast_loss = contrast_loss self.contrast_encoder = nn.LayerList() for in_c in self.in_channels: self.contrast_encoder.append(nn.Conv2D(in_c, 128, 3, padding=1)) self._init_contrast_encoder() def _init_contrast_encoder(self): bias_en = bias_init_with_prob(0.01) for en_ in self.contrast_encoder: constant_(en_.weight) constant_(en_.bias, bias_en) def forward_train(self, feats, targets, aux_pred=None): anchors, anchor_points, num_anchors_list, stride_tensor = \ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) cls_score_list, reg_distri_list = [], [] contrast_encoder_list = [] for i, feat in enumerate(feats): avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) contrast_logit = self.contrast_encoder[i](self.stem_cls[i]( feat, avg_feat) + feat) contrast_encoder_list.append( contrast_logit.flatten(2).transpose([0, 2, 1])) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) cls_score_list = 
paddle.concat(cls_score_list, axis=1) reg_distri_list = paddle.concat(reg_distri_list, axis=1) contrast_encoder_list = paddle.concat(contrast_encoder_list, axis=1) return self.get_loss([ cls_score_list, reg_distri_list, contrast_encoder_list, anchors, anchor_points, num_anchors_list, stride_tensor ], targets) def get_loss(self, head_outs, gt_meta): pred_scores, pred_distri, pred_contrast_encoder, anchors,\ anchor_points, num_anchors_list, stride_tensor = head_outs anchor_points_s = anchor_points / stride_tensor pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = \ self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes, pred_bboxes=pred_bboxes.detach() * stride_tensor) alpha_l = 0.25 else: if self.sm_use: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, stride_tensor, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) else: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox assigned_bboxes /= stride_tensor # cls loss if self.use_varifocal_loss: one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label) else: loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) assigned_scores_sum = assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(assigned_scores_sum) assigned_scores_sum /= paddle.distributed.get_world_size() assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) loss_cls /= assigned_scores_sum loss_l1, loss_iou, loss_dfl = \ self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum) # contrast loss loss_contrast = self.contrast_loss(pred_contrast_encoder.reshape([-1, pred_contrast_encoder.shape[-1]]), \ assigned_labels.reshape([-1]), assigned_scores.max(-1).reshape([-1])) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl + \ self.loss_weight['contrast'] * loss_contrast out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl, 'loss_l1': loss_l1, 'loss_contrast': loss_contrast } return out_dict ================================================ FILE: ppdet/modeling/heads/ppyoloe_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
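# Editor's note on PPYOLOEContrastHead.get_loss above (observation by the
# editor, not from the source): it reads self.loss_weight['contrast'],
# while the default loss_weight dict only defines 'class', 'iou' and 'dfl',
# so a config is expected to supply the extra key, e.g. (the 0.2 value is a
# placeholder, not taken from the repo's configs):
#
#   PPYOLOEContrastHead:
#     loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5, contrast: 0.2}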
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from paddle import ParamAttr from paddle.nn.initializer import KaimingNormal from paddle.nn.initializer import Normal, Constant from ..bbox_utils import batch_distance2bbox from ..losses import GIoULoss from ..initializer import bias_init_with_prob, constant_, normal_ from ..assigners.utils import generate_anchors_for_grid_cell from ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock from ppdet.modeling.ops import get_static_shape, get_act_fn from ppdet.modeling.layers import MultiClassNMS __all__ = ['PPYOLOEHead', 'SimpleConvHead'] class ESEAttn(nn.Layer): def __init__(self, feat_channels, act='swish', attn_conv='convbn'): super(ESEAttn, self).__init__() self.fc = nn.Conv2D(feat_channels, feat_channels, 1) if attn_conv == 'convbn': self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) elif attn_conv == 'repvgg': self.conv = RepVggBlock(feat_channels, feat_channels, act=act) else: self.conv = None self._init_weights() def _init_weights(self): normal_(self.fc.weight, std=0.001) def forward(self, feat, avg_feat): weight = F.sigmoid(self.fc(avg_feat)) if self.conv: return self.conv(feat * weight) else: return feat * weight @register class PPYOLOEHead(nn.Layer): __shared__ = [ 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process', 'use_shared_conv', 'for_distill' ] __inject__ = ['static_assigner', 'assigner', 'nms'] def __init__(self, in_channels=[1024, 512, 256], num_classes=80, act='swish', fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, reg_range=None, static_assigner_epoch=4, use_varifocal_loss=True, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', nms='MultiClassNMS', eval_size=None, loss_weight={ 'class': 1.0, 'iou': 2.5, 'dfl': 0.5, }, trt=False, attn_conv='convbn', exclude_nms=False, exclude_post_process=False, use_shared_conv=True, for_distill=False): super(PPYOLOEHead, self).__init__() assert len(in_channels) > 0, "len(in_channels) should > 0" self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.grid_cell_scale = grid_cell_scale self.grid_cell_offset = grid_cell_offset if reg_range: self.sm_use = True self.reg_range = reg_range else: self.sm_use = False self.reg_range = (0, reg_max + 1) self.reg_channels = self.reg_range[1] - self.reg_range[0] self.iou_loss = GIoULoss() self.loss_weight = loss_weight self.use_varifocal_loss = use_varifocal_loss self.eval_size = eval_size self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms self.exclude_post_process = exclude_post_process self.use_shared_conv = use_shared_conv self.for_distill = for_distill self.is_teacher = False # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act for in_c in self.in_channels: self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) # pred head self.pred_cls = nn.LayerList() self.pred_reg = nn.LayerList() for in_c in self.in_channels: self.pred_cls.append( nn.Conv2D( in_c, self.num_classes, 3, padding=1)) self.pred_reg.append( nn.Conv2D( in_c, 4 * self.reg_channels, 3, padding=1)) # projection conv self.proj_conv = 
nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) self.proj_conv.skip_quant = True self._init_weights() if self.for_distill: self.distill_pairs = {} @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) for cls_, reg_ in zip(self.pred_cls, self.pred_reg): constant_(cls_.weight) constant_(cls_.bias, bias_cls) constant_(reg_.weight) constant_(reg_.bias, 1.0) proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, self.reg_channels).reshape( [1, self.reg_channels, 1, 1]) self.proj_conv.weight.set_value(proj) self.proj_conv.weight.stop_gradient = True if self.eval_size: anchor_points, stride_tensor = self._generate_anchors() self.anchor_points = anchor_points self.stride_tensor = stride_tensor def m_avg_pool2d(self, feat, w, h): batch_size, channels, _, _ = feat.shape feat_flat = paddle.reshape(feat, [batch_size, channels, -1]) feat_mean = paddle.mean(feat_flat, axis=2) feat_mean = paddle.reshape( feat_mean, [batch_size, channels, w, h]) return feat_mean def forward_train(self, feats, targets, aux_pred=None): anchors, anchor_points, num_anchors_list, stride_tensor = \ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) cls_score_list, reg_distri_list = [], [] for i, feat in enumerate(feats): if (paddle.get_device()[:3]=='npu'): avg_feat = self.m_avg_pool2d(feat, 1, 1) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) reg_distri_list = paddle.concat(reg_distri_list, axis=1) if targets.get('is_teacher', False): pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) return cls_score_list, pred_deltas * stride_tensor, pred_dfls if targets.get('get_data', False): pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) return cls_score_list, pred_deltas * stride_tensor, pred_dfls return self.get_loss([ cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor ], targets, aux_pred) def _generate_anchors(self, feats=None, dtype='float32'): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_strides): if feats is not None: _, _, h, w = feats[i].shape else: h = int(self.eval_size[0] / stride) w = int(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.grid_cell_offset shift_y = paddle.arange(end=h) + self.grid_cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype=dtype) anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def forward_eval(self, feats): if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(feats) cls_score_list, reg_dist_list = [], [] for i, feat in enumerate(feats): _, _, h, w = feat.shape l = h * w if (paddle.device.get_device()[:3]=='npu'): avg_feat = 
self.m_avg_pool2d(feat, 1, 1) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) reg_dist = reg_dist.reshape( [-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) if self.use_shared_conv: reg_dist = self.proj_conv(F.softmax( reg_dist, axis=1)).squeeze(1) else: reg_dist = F.softmax(reg_dist, axis=1) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) reg_dist_list.append(reg_dist) cls_score_list = paddle.concat(cls_score_list, axis=-1) if self.use_shared_conv: reg_dist_list = paddle.concat(reg_dist_list, axis=1) else: reg_dist_list = paddle.concat(reg_dist_list, axis=2) reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) return cls_score_list, reg_dist_list, anchor_points, stride_tensor def forward(self, feats, targets=None, aux_pred=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" if self.training: return self.forward_train(feats, targets, aux_pred) else: if targets is not None: # only for semi-det self.is_teacher = targets.get('is_teacher', False) if self.is_teacher: return self.forward_train(feats, targets, aux_pred=None) else: return self.forward_eval(feats) return self.forward_eval(feats) @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy( score, label, weight=weight, reduction='sum') return loss @staticmethod def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy( pred_score, gt_score, weight=weight, reduction='sum') return loss def _bbox_decode(self, anchor_points, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) return batch_distance2bbox(anchor_points, pred_dist) def _bbox_decode_fake(self, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist_dfl = F.softmax( pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2 ])).squeeze(1) return pred_dist, pred_dist_dfl def _bbox2distance(self, points, bbox): x1y1, x2y2 = paddle.split(bbox, 2, -1) lt = points - x1y1 rb = x2y2 - points return paddle.concat([lt, rb], -1).clip(self.reg_range[0], self.reg_range[1] - 1 - 0.01) def _df_loss(self, pred_dist, target, lower_bound=0): target_left = paddle.cast(target.floor(), 'int64') target_right = target_left + 1 weight_left = target_right.astype('float32') - target weight_right = 1 - weight_left loss_left = F.cross_entropy( pred_dist, target_left - lower_bound, reduction='none') * weight_left loss_right = F.cross_entropy( pred_dist, target_right - lower_bound, reduction='none') * weight_right return (loss_left + loss_right).mean(-1, keepdim=True) def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum): # select positive samples mask mask_positive = (assigned_labels != self.num_classes) if self.for_distill: # only used for LD main_kd distill self.distill_pairs['mask_positive_select'] = mask_positive num_pos = mask_positive.sum() # pos/neg loss if num_pos > 0: # l1 + iou 
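            # Editor's note (illustrative shapes, assuming batch size B and
            # A total anchors): mask_positive is [B, A]; tiling it to
            # [B, A, 4] lets paddle.masked_select pull the positive boxes
            # out of pred_bboxes [B, A, 4] as a flat [-1, 4] tensor, and the
            # same trick with reg_channels * 4 channels gathers the matching
            # DFL logits below.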
bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( [1, 1, 4]).astype('bool') pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 4]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).unsqueeze(-1) loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / assigned_scores_sum dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( [1, 1, self.reg_channels * 4]).astype('bool') pred_dist_pos = paddle.masked_select( pred_dist, dist_mask).reshape([-1, 4, self.reg_channels]) assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes) assigned_ltrb_pos = paddle.masked_select( assigned_ltrb, bbox_mask).reshape([-1, 4]) loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, self.reg_range[0]) * bbox_weight loss_dfl = loss_dfl.sum() / assigned_scores_sum if self.for_distill: self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos self.distill_pairs['pred_dist_pos'] = pred_dist_pos self.distill_pairs['bbox_weight'] = bbox_weight else: loss_l1 = paddle.zeros([]) loss_iou = paddle.zeros([]) loss_dfl = pred_dist.sum() * 0. return loss_l1, loss_iou, loss_dfl def get_loss(self, head_outs, gt_meta, aux_pred=None): pred_scores, pred_distri, anchors,\ anchor_points, num_anchors_list, stride_tensor = head_outs anchor_points_s = anchor_points / stride_tensor pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) if aux_pred is not None: pred_scores_aux = aux_pred[0] pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1]) if 'origin_gt_class' in gt_meta: gt_labels = gt_meta['origin_gt_class'] gt_bboxes = gt_meta['origin_gt_bbox'] pad_gt_mask = gt_meta['pad_origin_gt_mask'] else: gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = \ self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes, pred_bboxes=pred_bboxes.detach() * stride_tensor) alpha_l = 0.25 else: if self.sm_use: # only used in smalldet of PPYOLOE-SOD model assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, stride_tensor, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) else: if aux_pred is None: if not hasattr(self, "assigned_labels"): assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) if self.for_distill: self.assigned_labels = assigned_labels self.assigned_bboxes = assigned_bboxes self.assigned_scores = assigned_scores else: # only used in distill assigned_labels = self.assigned_labels assigned_bboxes = self.assigned_bboxes assigned_scores = self.assigned_scores else: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores_aux.detach(), pred_bboxes_aux.detach() * stride_tensor, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox assigned_bboxes /= stride_tensor assign_out_dict = self.get_loss_from_assign( pred_scores, pred_distri, pred_bboxes, anchor_points_s, 
assigned_labels, assigned_bboxes, assigned_scores, alpha_l) if aux_pred is not None: assign_out_dict_aux = self.get_loss_from_assign( aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, alpha_l) loss = {} for key in assign_out_dict.keys(): loss[key] = assign_out_dict[key] + assign_out_dict_aux[key] else: loss = assign_out_dict return loss def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, alpha_l): # cls loss if self.use_varifocal_loss: one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label) else: loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) assigned_scores_sum = assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(assigned_scores_sum) assigned_scores_sum /= paddle.distributed.get_world_size() assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) loss_cls /= assigned_scores_sum if self.for_distill: self.distill_pairs['pred_cls_scores'] = pred_scores self.distill_pairs['pos_num'] = assigned_scores_sum self.distill_pairs['assigned_scores'] = assigned_scores one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] self.distill_pairs['target_labels'] = one_hot_label loss_l1, loss_iou, loss_dfl = \ self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl, 'loss_l1': loss_l1, } return out_dict def post_process(self, head_outs, scale_factor): pred_scores, pred_dist, anchor_points, stride_tensor = head_outs pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) pred_bboxes *= stride_tensor if self.exclude_post_process: return paddle.concat( [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None, None else: # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) pred_bboxes /= scale_factor if self.exclude_nms: # `exclude_nms=True` just use in benchmark return pred_bboxes, pred_scores, None else: bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num, nms_keep_idx def get_activation(name="LeakyReLU"): if name == "silu": module = nn.Silu() elif name == "relu": module = nn.ReLU() elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']: module = nn.LeakyReLU(0.1) elif name is None: module = nn.Identity() else: raise AttributeError("Unsupported act type: {}".format(name)) return module class ConvNormLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, norm_type='gn', activation="LeakyReLU"): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'syncbn', 'gn', None] self.conv = nn.Conv2D( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias_attr=False, weight_attr=ParamAttr(initializer=KaimingNormal())) if norm_type in ['bn', 'sync_bn', 'syncbn']: self.norm = nn.BatchNorm2D(out_channels) elif norm_type == 'gn': self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels) 
else: self.norm = None self.act = get_activation(activation) def forward(self, x): y = self.conv(x) if self.norm is not None: y = self.norm(y) y = self.act(y) return y class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. """ def __init__(self, scale=1.0): super(ScaleReg, self).__init__() scale = paddle.to_tensor(scale) self.scale = self.create_parameter( shape=[1], dtype='float32', default_initializer=nn.initializer.Assign(scale)) def forward(self, x): return x * self.scale @register class SimpleConvHead(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=80, feat_in=288, feat_out=288, num_convs=1, fpn_strides=[32, 16, 8, 4], norm_type='gn', act='LeakyReLU', prior_prob=0.01, reg_max=16): super(SimpleConvHead, self).__init__() self.num_classes = num_classes self.feat_in = feat_in self.feat_out = feat_out self.num_convs = num_convs self.fpn_strides = fpn_strides self.reg_max = reg_max self.cls_convs = nn.LayerList() self.reg_convs = nn.LayerList() for i in range(self.num_convs): in_c = feat_in if i == 0 else feat_out self.cls_convs.append( ConvNormLayer( in_c, feat_out, 3, stride=1, padding=1, norm_type=norm_type, activation=act)) self.reg_convs.append( ConvNormLayer( in_c, feat_out, 3, stride=1, padding=1, norm_type=norm_type, activation=act)) bias_cls = bias_init_with_prob(prior_prob) self.gfl_cls = nn.Conv2D( feat_out, self.num_classes, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=bias_cls))) self.gfl_reg = nn.Conv2D( feat_out, 4 * (self.reg_max + 1), kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0))) self.scales = nn.LayerList() for i in range(len(self.fpn_strides)): self.scales.append(ScaleReg(1.0)) def forward(self, feats): cls_scores = [] bbox_preds = [] for x, scale in zip(feats, self.scales): cls_feat = x reg_feat = x for cls_conv in self.cls_convs: cls_feat = cls_conv(cls_feat) for reg_conv in self.reg_convs: reg_feat = reg_conv(reg_feat) cls_score = self.gfl_cls(cls_feat) cls_score = F.sigmoid(cls_score) cls_score = cls_score.flatten(2).transpose([0, 2, 1]) cls_scores.append(cls_score) bbox_pred = scale(self.gfl_reg(reg_feat)) bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) bbox_preds.append(bbox_pred) cls_scores = paddle.concat(cls_scores, axis=1) bbox_preds = paddle.concat(bbox_preds, axis=1) return cls_scores, bbox_preds ================================================ FILE: ppdet/modeling/heads/ppyoloe_ins_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
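# Editor's sketch for ScaleReg defined in ppyoloe_head.py above (not from
# the source): it is a single learnable scalar applied per FPN level, so
# regression magnitudes can adapt to each stride independently, e.g.:
#
#   import paddle
#   scale = ScaleReg(scale=1.0)
#   out = scale(paddle.ones([2, 68, 10, 10]))  # multiplies by the parameter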
import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.backbones.csp_darknet import BaseConv from ppdet.modeling.layers import MultiClassNMS from ppdet.modeling.ops import get_static_shape, get_act_fn from .ppyoloe_head import ESEAttn from ..assigners.utils import generate_anchors_for_grid_cell from ..bbox_utils import batch_distance2bbox from ..initializer import bias_init_with_prob, constant_ from ..losses import GIoULoss __all__ = ['PPYOLOEInsHead'] def custom_binary_cross_entropy_with_logits(x, y): max_val = paddle.maximum(-x, paddle.to_tensor(0.0)) loss = (1 - y) * x + max_val + paddle.log( paddle.exp(-max_val) + paddle.exp(-x - max_val)) return loss class MaskProto(nn.Layer): # YOLOv8 mask Proto module for instance segmentation models def __init__(self, ch_in, num_protos=256, num_masks=32, act='silu'): super().__init__() self.conv1 = BaseConv(ch_in, num_protos, 3, 1, act=act) self.upsample = nn.Conv2DTranspose(num_protos, num_protos, 2, 2, 0, bias_attr=True) self.conv2 = BaseConv(num_protos, num_protos, 3, 1, act=act) self.conv3 = BaseConv(num_protos, num_masks, 1, 1, act=act) def forward(self, x): return self.conv3(self.conv2(self.upsample(self.conv1(x)))) def xyxy2xywh(x): """ Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner. """ assert x.shape[ -1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}' y = paddle.empty_like(x) if isinstance( x, paddle.Tensor) else np.empty_like(x) # faster than clone/copy y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center y[..., 2] = x[..., 2] - x[..., 0] # width y[..., 3] = x[..., 3] - x[..., 1] # height return y def crop_mask(masks, boxes): """ It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box. Args: masks (paddle.Tensor): [n, h, w] tensor of masks boxes (paddle.Tensor): [n, 4] tensor of bbox coordinates in relative point form Returns: (paddle.Tensor): The masks cropped to their bounding boxes. """ _, h, w = masks.shape x1, y1, x2, y2 = paddle.chunk(boxes[:, :, None], 4, axis=1) r = paddle.arange(w, dtype=x1.dtype)[None, None, :] c = paddle.arange(h, dtype=y1.dtype)[None, :, None] if "npu" in paddle.device.get_all_custom_device_type(): # bool tensor broadcast multiply is extremely slow on npu, so we cast it to float32. m_dtype = masks.dtype return masks * ((r >= x1).cast(m_dtype) * (r < x2).cast(m_dtype) * (c >= y1).cast(m_dtype) * (c < y2).cast(m_dtype)) else: return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)).astype(masks.dtype) def process_mask_upsample(protos, masks_in, bboxes, shape): """ It takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher quality but is slower. Args: protos (paddle.Tensor): [mask_dim, mask_h, mask_w] masks_in (paddle.Tensor): [n, mask_dim], n is number of masks after nms bboxes (paddle.Tensor): [n, 4], n is number of masks after nms shape (tuple): the size of the input image (h,w) Returns: (paddle.Tensor): The upsampled masks.
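    Example (editor's note, illustrative shapes only; mask_dim=32 and a
    640x640 input are assumed):

        >>> protos = paddle.randn([32, 160, 160])
        >>> masks_in = paddle.randn([5, 32])       # 5 detections kept after NMS
        >>> bboxes = paddle.to_tensor([[0., 0., 320., 320.]] * 5)
        >>> masks = process_mask_upsample(protos, masks_in, bboxes, (640, 640))
        >>> masks.shape  # [5, 640, 640]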
""" c, mh, mw = protos.shape # CHW masks = F.sigmoid(masks_in @ protos.reshape([c, -1])).reshape([-1, mh, mw]) masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW masks = crop_mask(masks, bboxes) # CHW return masks @register class PPYOLOEInsHead(nn.Layer): __shared__ = [ 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process', 'use_shared_conv', 'for_distill', 'width_mult' ] __inject__ = ['static_assigner', 'assigner', 'nms'] def __init__(self, in_channels=[1024, 512, 256], num_classes=80, act='swish', fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, reg_range=None, static_assigner_epoch=4, use_varifocal_loss=True, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', nms='MultiClassNMS', eval_size=None, loss_weight={ 'class': 1.0, 'iou': 2.5, 'dfl': 0.5, }, trt=False, attn_conv='convbn', exclude_nms=False, exclude_post_process=False, use_shared_conv=True, mask_thr_binary=0.5, num_masks=32, num_protos=256, width_mult=1.0, for_distill=False): super(PPYOLOEInsHead, self).__init__() assert len(in_channels) > 0, "len(in_channels) should > 0" self.mask_thr_binary = mask_thr_binary self.num_masks = num_masks self.num_protos = int(num_protos * width_mult) self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.grid_cell_scale = grid_cell_scale self.grid_cell_offset = grid_cell_offset if reg_range: self.sm_use = True self.reg_range = reg_range else: self.sm_use = False self.reg_range = (0, reg_max + 1) self.reg_channels = self.reg_range[1] - self.reg_range[0] self.iou_loss = GIoULoss() self.loss_weight = loss_weight self.use_varifocal_loss = use_varifocal_loss self.eval_size = eval_size self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms self.exclude_post_process = exclude_post_process self.use_shared_conv = use_shared_conv self.for_distill = for_distill self.is_teacher = False # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() self.stem_ins = nn.LayerList() act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act for in_c in self.in_channels: self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) self.stem_ins.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) # pred head self.pred_cls = nn.LayerList() self.pred_reg = nn.LayerList() self.pred_ins = nn.LayerList() for in_c in self.in_channels: self.pred_cls.append( nn.Conv2D(in_c, self.num_classes, 3, padding=1)) self.pred_reg.append( nn.Conv2D(in_c, 4 * self.reg_channels, 3, padding=1)) self.pred_ins.append(nn.Conv2D(in_c, self.num_masks, 3, padding=1)) # projection conv self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) self.proj_conv.skip_quant = True self._init_weights() self.proto = MaskProto(in_channels[-1], self.num_protos, self.num_masks, act=act) if self.for_distill: self.distill_pairs = {} @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) for cls_, reg_ in zip(self.pred_cls, self.pred_reg): constant_(cls_.weight) constant_(cls_.bias, bias_cls) constant_(reg_.weight) constant_(reg_.bias, 1.0) proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, 
self.reg_channels).reshape( [1, self.reg_channels, 1, 1]) self.proj_conv.weight.set_value(proj) self.proj_conv.weight.stop_gradient = True if self.eval_size: anchor_points, stride_tensor = self._generate_anchors() self.anchor_points = anchor_points self.stride_tensor = stride_tensor def forward_train(self, feats, targets): anchors, anchor_points, num_anchors_list, stride_tensor = \ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) cls_score_list, reg_distri_list = [], [] mask_feat = self.proto(feats[-1]) mask_coeff_list = [] for i, feat in enumerate(feats): _, _, h, w = feat.shape l = h * w if "npu" in paddle.device.get_all_custom_device_type( ): # backward in avgpool is extremely slow in npu kernel, replace it with mean avg_feat = feat.mean(axis=[2, 3], keepdim=True) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) msk_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) + feat) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) mask_coeff_list.append(msk_coeff.flatten(2).transpose([0, 2, 1])) ### reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) mask_coeff_list = paddle.concat(mask_coeff_list, axis=1) reg_distri_list = paddle.concat(reg_distri_list, axis=1) return self.get_loss([ cls_score_list, reg_distri_list, mask_coeff_list, mask_feat, anchors, anchor_points, num_anchors_list, stride_tensor ], targets) def _generate_anchors(self, feats=None, dtype='float32'): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_strides): if feats is not None: _, _, h, w = feats[i].shape else: h = int(self.eval_size[0] / stride) w = int(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.grid_cell_offset shift_y = paddle.arange(end=h) + self.grid_cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast(paddle.stack([shift_x, shift_y], axis=-1), dtype=dtype) anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def forward_eval(self, feats): mask_proto = self.proto(feats[-1]) if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(feats) cls_score_list, reg_dist_list, pred_mask_list = [], [], [] feats_shapes = [] for i, feat in enumerate(feats): _, _, h, w = feat.shape l = h * w feats_shapes.append(l) if "npu" in paddle.device.get_all_custom_device_type(): # backward in avgpool is extremely slow in npu kernel, replace it with mean avg_feat = feat.mean(axis=[2, 3], keepdim=True) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) mask_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) + feat) pred_mask_list.append(mask_coeff.reshape([-1, self.num_masks, l])) reg_dist = reg_dist.reshape([-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) if self.use_shared_conv: reg_dist = self.proj_conv(F.softmax(reg_dist, axis=1)).squeeze(1) else: reg_dist = F.softmax(reg_dist, axis=1) 
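            # Note: `self.proj_conv` is a frozen 1x1 conv whose weight is the
            # bin-value vector linspace(reg_range[0], reg_range[1] - 1) set in
            # `_init_weights`, so applying it after the softmax over the
            # `reg_channels` bins computes the expectation of the predicted
            # distance distribution (DFL-style decoding), one scalar per side.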
# cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) reg_dist_list.append(reg_dist) cls_score_list = paddle.concat(cls_score_list, axis=-1) pred_mask_list = paddle.concat(pred_mask_list, axis=-1) if self.use_shared_conv: reg_dist_list = paddle.concat(reg_dist_list, axis=1) else: reg_dist_list = paddle.concat(reg_dist_list, axis=2) reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) return cls_score_list, reg_dist_list, pred_mask_list, mask_proto, anchor_points, stride_tensor def forward(self, feats, targets=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" if self.training: return self.forward_train(feats, targets) else: return self.forward_eval(feats) @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy(score, label, weight=weight, reduction='sum') return loss @staticmethod def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy(pred_score, gt_score, weight=weight, reduction='sum') return loss def _bbox_decode(self, anchor_points, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) return batch_distance2bbox(anchor_points, pred_dist) def _bbox_decode_fake(self, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist_dfl = F.softmax( pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2])).squeeze(1) return pred_dist, pred_dist_dfl def _bbox2distance(self, points, bbox): x1y1, x2y2 = paddle.split(bbox, 2, -1) lt = points - x1y1 rb = x2y2 - points if "npu" in paddle.device.get_all_custom_device_type( ): # npu clip kernel causes nan grad, replace it with maximum & minimum. 
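            # (Distances are clipped into [reg_range[0], reg_range[1] - 1) so
            # every regression target falls inside the discrete DFL bin range.)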
            out = paddle.concat([lt, rb], -1)
            out = paddle.maximum(
                out, paddle.to_tensor(self.reg_range[0], dtype=out.dtype))
            out = paddle.minimum(
                out,
                paddle.to_tensor(
                    self.reg_range[1] - 1 - 0.01, dtype=out.dtype))
            return out
        else:
            return paddle.concat([lt, rb], -1).clip(
                self.reg_range[0], self.reg_range[1] - 1 - 0.01)

    def _df_loss(self, pred_dist, target, lower_bound=0):
        target_left = paddle.cast(target.floor(), 'int64')
        target_right = target_left + 1
        weight_left = target_right.astype('float32') - target
        weight_right = 1 - weight_left
        loss_left = F.cross_entropy(
            pred_dist, target_left - lower_bound,
            reduction='none') * weight_left
        loss_right = F.cross_entropy(
            pred_dist, target_right - lower_bound,
            reduction='none') * weight_right
        return (loss_left + loss_right).mean(-1, keepdim=True)

    def get_loss(self, head_outs, gt_meta):
        assert 'gt_bbox' in gt_meta and 'gt_class' in gt_meta
        assert 'gt_segm' in gt_meta
        pred_scores, pred_distri, pred_mask_coeffs, mask_proto, anchors, \
            anchor_points, num_anchors_list, stride_tensor = head_outs

        bs = pred_scores.shape[0]
        imgsz = paddle.to_tensor(
            [640, 640]
        )  # paddle.to_tensor(pred_scores[0].shape[2:]) * self.fpn_strides[0]  # image size (h,w)
        mask_h, mask_w = mask_proto.shape[-2:]
        anchor_points_s = anchor_points / stride_tensor
        pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)

        gt_labels = paddle.stack(gt_meta['gt_class'])
        gt_bboxes = paddle.stack(gt_meta['gt_bbox'])
        pad_gt_mask = paddle.stack(gt_meta['pad_gt_mask'])
        gt_segms = paddle.stack(gt_meta['gt_segm']).cast('float32')
        if tuple(gt_segms.shape[-2:]) != (mask_h, mask_w):  # downsample
            gt_segms = F.interpolate(
                gt_segms, (mask_h, mask_w),
                mode='nearest').reshape([bs, -1, mask_h * mask_w])

        # label assignment
        assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index = \
            self.assigner(
                pred_scores.detach(),
                pred_bboxes.detach() * stride_tensor,
                anchor_points,
                num_anchors_list,
                gt_labels,
                gt_bboxes,
                pad_gt_mask,
                bg_index=self.num_classes,
                gt_segms=gt_segms)
        # rescale bbox
        assigned_bboxes /= stride_tensor

        # assign segms for masks
        assigned_masks = paddle.gather(
            gt_segms.reshape([-1, mask_h * mask_w]),
            assigned_gt_index.flatten(),
            axis=0)
        assigned_masks = assigned_masks.reshape(
            [bs, assigned_gt_index.shape[1], mask_h * mask_w])

        assign_out_dict = self.get_loss_from_assign(
            pred_scores, pred_distri, pred_bboxes, anchor_points_s,
            assigned_labels, assigned_bboxes, assigned_scores, assigned_masks,
            pred_mask_coeffs, mask_proto, stride_tensor, imgsz)
        loss = assign_out_dict
        return loss

    def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes,
                             anchor_points_s, assigned_labels,
                             assigned_bboxes, assigned_scores, assigned_masks,
                             pred_mask_coeffs, mask_proto, stride_tensor,
                             imgsz):
        # cls loss
        if self.use_varifocal_loss:
            one_hot_label = F.one_hot(assigned_labels,
                                      self.num_classes + 1)[..., :-1]
            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
                                            one_hot_label)
        else:
            # `_focal_loss` takes `alpha`, not `alpha_l`; alpha=-1 disables
            # the alpha re-weighting branch.
            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=-1)

        assigned_scores_sum = assigned_scores.sum()
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(assigned_scores_sum)
            assigned_scores_sum /= paddle.distributed.get_world_size()
        if "npu" in paddle.device.get_all_custom_device_type():
            # npu clip kernel causes nan grad, replace it with maximum & minimum.
            assigned_scores_sum = paddle.maximum(
                assigned_scores_sum,
                paddle.to_tensor(
                    1., dtype=assigned_scores_sum.dtype))
        else:
            assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
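        # `assigned_scores_sum` acts as the number of effective positives:
        # all-reducing and averaging it across cards keeps the loss scale
        # consistent with single-card training, and the clip/maximum above
        # guards the divisions below when a batch has no positive anchors.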
loss_cls /= assigned_scores_sum # select positive samples mask mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.sum() # pos/neg loss if num_pos > 0: # l1 + iou bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( [1, 1, 4]).astype('bool') pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 4]) bbox_weight = paddle.masked_select(assigned_scores.sum(-1), mask_positive).unsqueeze(-1) loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / assigned_scores_sum # dfl loss dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( [1, 1, self.reg_channels * 4]).astype('bool') pred_dist_pos = paddle.masked_select(pred_distri, dist_mask).reshape([ -1, 4, self.reg_channels ]) # pred_dist in funs assigned_ltrb = self._bbox2distance( anchor_points_s, assigned_bboxes) # anchor_points in func assigned_ltrb_pos = paddle.masked_select( assigned_ltrb, bbox_mask).reshape([-1, 4]) loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, self.reg_range[0]) * bbox_weight loss_dfl = loss_dfl.sum() / assigned_scores_sum # mask loss loss_mask = self.calculate_segmentation_loss( mask_positive, assigned_masks, assigned_bboxes * stride_tensor, mask_proto, pred_mask_coeffs, imgsz) # [bs, 8400] [bs, 8400, 160 * 160] [bs, 8400, 4] [bs, 32, 160, 160] [bs, 8400, 32] loss_mask /= assigned_scores_sum else: loss_l1 = paddle.zeros([1]) loss_iou = paddle.zeros([1]) loss_mask = paddle.zeros([1]) loss_dfl = paddle.zeros([1]) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl + \ self.loss_weight['iou'] * loss_mask out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl, 'loss_mask': loss_mask, 'loss_l1': loss_l1, } return out_dict def calculate_segmentation_loss(self, fg_mask, masks, target_bboxes, proto, pred_masks, imgsz, overlap=True): """ Calculate the loss for instance segmentation. Args: fg_mask (paddle.Tensor): A binary tensor of shape (BS, N_anchors) indicating which anchors are positive. masks (paddle.Tensor): Ground truth masks of shape (BS, H, W) if `overlap` is False, otherwise (BS, ?, H, W). target_gt_idx (paddle.Tensor): Indexes of ground truth objects for each anchor of shape (BS, N_anchors). target_bboxes (paddle.Tensor): Ground truth bounding boxes for each anchor of shape (BS, N_anchors, 4). batch_idx (paddle.Tensor): Batch indices of shape (N_labels_in_batch, 1). proto (paddle.Tensor): Prototype masks of shape (BS, 32, H, W). pred_masks (paddle.Tensor): Predicted masks for each anchor of shape (BS, N_anchors, 32). imgsz (paddle.Tensor): Size of the input image as a tensor of shape (2), i.e., (H, W). overlap (bool): Whether the masks in `masks` tensor overlap. Returns: (paddle.Tensor): The calculated loss for instance segmentation. Notes: The batch loss can be computed for improved speed at higher memory usage. 
For example, pred_mask can be computed as follows: pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) # (i, 32) @ (32, 160, 160) -> (i, 160, 160) """ _, _, mask_h, mask_w = proto.shape loss = paddle.to_tensor([0.]) # Normalize to 0-1 target_bboxes_normalized = target_bboxes / imgsz[[1, 0, 1, 0]].cast( target_bboxes.dtype) # [8, 8400, 4] # Areas of target bboxes marea = xyxy2xywh(target_bboxes_normalized)[..., 2:].prod(2).unsqueeze(-1) # Normalize to mask size mxyxy = target_bboxes_normalized * paddle.to_tensor( [mask_w, mask_h, mask_w, mask_h], dtype=target_bboxes_normalized.dtype) for i, single_i in enumerate( zip(fg_mask, pred_masks, proto, mxyxy, marea, masks)): fg_mask_i, pred_masks_i, proto_i, mxyxy_i, marea_i, masks_i = single_i # [8400] [8400, 32] [32, 160, 160] [8400, 4] [8400, 1] [8400, 25600] if fg_mask_i.any(): loss += self.single_mask_loss(masks_i[fg_mask_i], pred_masks_i[fg_mask_i], proto_i, mxyxy_i[fg_mask_i], marea_i[fg_mask_i]) # [10, 25600] [10, 32] [32, 160, 160] [10, 4] [10, 1] else: loss += (proto * 0).sum() + ( pred_masks * 0).sum() # inf sums may lead to nan loss return loss @staticmethod def single_mask_loss(gt_mask, pred, proto, xyxy, area): """ Compute the instance segmentation loss for a single image. Args: gt_mask (paddle.Tensor): Ground truth mask of shape (n, H, W), where n is the number of objects. pred (paddle.Tensor): Predicted mask coefficients of shape (n, 32). proto (paddle.Tensor): Prototype masks of shape (32, H, W). xyxy (paddle.Tensor): Ground truth bounding boxes in xyxy format, normalized to [0, 1], of shape (n, 4). area (paddle.Tensor): Area of each ground truth bounding box of shape (n,). Returns: (paddle.Tensor): The calculated mask loss for a single image. Notes: The function uses the equation pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) to produce the predicted masks from the prototype masks and predicted mask coefficients. """ nt = pred.shape[0] gt_mask = gt_mask.reshape([nt, *proto.shape[1:]]) nmasks = 32 pred_mask = (pred @ proto.reshape([nmasks, -1])).reshape( [-1, *proto.shape[1:]]) # (n,32) @ (32,80,80) -> (n,80,80) if "npu" in paddle.device.get_all_custom_device_type(): # bce npu kernel causes nan grad, replace it with numeric stable custom implementation. 
loss = custom_binary_cross_entropy_with_logits(pred_mask, gt_mask) else: loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction='none') return (crop_mask(loss, xyxy).mean(axis=(1, 2)) / area.squeeze(-1)).sum() def post_process(self, head_outs, im_shape, scale_factor, infer_shape=[640, 640], rescale=True): pred_scores, pred_dist, pred_mask_coeffs, mask_feat, anchor_points, stride_tensor = head_outs pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) pred_bboxes *= stride_tensor if self.exclude_post_process: return paddle.concat([ pred_bboxes, pred_scores.transpose([0, 2, 1]), pred_mask_coeffs.transpose([0, 2, 1]) ], axis=-1), mask_feat, None # [1, 8400, 4+80+32], [1, 32, 160, 160] bbox_pred, bbox_num, keep_idxs = self.nms(pred_bboxes, pred_scores) if bbox_num.sum() > 0: pred_mask_coeffs = pred_mask_coeffs.transpose([0, 2, 1]) mask_coeffs = paddle.gather( pred_mask_coeffs.reshape([-1, self.num_masks]), keep_idxs) mask_logits = process_mask_upsample(mask_feat[0], mask_coeffs, bbox_pred[:, 2:6], infer_shape) if rescale: ori_h, ori_w = im_shape[0] / scale_factor[0] mask_logits = F.interpolate( mask_logits.unsqueeze(0), size=[ int(paddle.round(mask_logits.shape[-2] / scale_factor[0][0])), int(paddle.round(mask_logits.shape[-1] / scale_factor[0][1])) ], mode='bilinear', align_corners=False) if "npu" in paddle.device.get_all_custom_device_type(): # due to npu numeric error, we need to take round of img size. mask_logits = mask_logits[ ..., :round(ori_h.item()), :round(ori_w.item())] else: mask_logits = mask_logits[..., :int(ori_h), :int(ori_w)] masks = mask_logits.squeeze(0) mask_pred = paddle.to_tensor(masks > self.mask_thr_binary).cast("float32") # scale bbox to origin scale_factor = scale_factor.flip(-1).tile([1, 2]) bbox_pred[:, 2:6] /= scale_factor else: ori_h, ori_w = im_shape[0] / scale_factor[0] bbox_num = paddle.to_tensor([1]).cast("int32") bbox_pred = paddle.zeros([bbox_num, 6]) mask_pred = paddle.zeros([bbox_num, int(ori_h), int(ori_w)]) return bbox_pred, bbox_num, mask_pred, keep_idxs ================================================ FILE: ppdet/modeling/heads/ppyoloe_r_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
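Before moving on, here is a minimal self-contained sketch of the prototype-mask composition that `process_mask_upsample` in ppyoloe_ins_head.py above builds on (illustrative only: the 160x160 prototype size, 32 coefficients, and 5 detections are assumed example shapes, not values taken from this repository):

import paddle
import paddle.nn.functional as F

protos = paddle.rand([32, 160, 160])  # [mask_dim, mask_h, mask_w] prototypes
coeffs = paddle.rand([5, 32])         # per-detection coefficients after NMS
# Each instance mask is a linear combination of the shared prototypes,
# squashed to probabilities, then upsampled to the network input size.
masks = F.sigmoid(coeffs @ protos.reshape([32, -1])).reshape([-1, 160, 160])
masks = F.interpolate(
    masks[None], [640, 640], mode='bilinear', align_corners=False)[0]
print(masks.shape)  # [5, 640, 640]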
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..losses import ProbIoULoss from ..initializer import bias_init_with_prob, constant_, normal_, vector_ from ppdet.modeling.backbones.cspresnet import ConvBNLayer from ppdet.modeling.ops import get_static_shape, get_act_fn, anchor_generator from ppdet.modeling.layers import MultiClassNMS __all__ = ['PPYOLOERHead'] class ESEAttn(nn.Layer): def __init__(self, feat_channels, act='swish'): super(ESEAttn, self).__init__() self.fc = nn.Conv2D(feat_channels, feat_channels, 1) self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) self._init_weights() def _init_weights(self): normal_(self.fc.weight, std=0.01) def forward(self, feat, avg_feat): weight = F.sigmoid(self.fc(avg_feat)) return self.conv(feat * weight) @register class PPYOLOERHead(nn.Layer): __shared__ = ['num_classes', 'trt', 'export_onnx'] __inject__ = ['static_assigner', 'assigner', 'nms'] def __init__(self, in_channels=[1024, 512, 256], num_classes=15, act='swish', fpn_strides=(32, 16, 8), grid_cell_offset=0.5, angle_max=90, use_varifocal_loss=True, static_assigner_epoch=4, trt=False, export_onnx=False, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', nms='MultiClassNMS', loss_weight={'class': 1.0, 'iou': 2.5, 'dfl': 0.05}): super(PPYOLOERHead, self).__init__() assert len(in_channels) > 0, "len(in_channels) should > 0" self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.grid_cell_offset = grid_cell_offset self.angle_max = angle_max self.loss_weight = loss_weight self.use_varifocal_loss = use_varifocal_loss self.half_pi = paddle.to_tensor( [1.5707963267948966], dtype=paddle.float32) self.half_pi_bin = self.half_pi / angle_max self.iou_loss = ProbIoULoss() self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.nms = nms # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() self.stem_angle = nn.LayerList() trt = False if export_onnx else trt self.export_onnx = export_onnx act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.trt = trt for in_c in self.in_channels: self.stem_cls.append(ESEAttn(in_c, act=act)) self.stem_reg.append(ESEAttn(in_c, act=act)) self.stem_angle.append(ESEAttn(in_c, act=act)) # pred head self.pred_cls = nn.LayerList() self.pred_reg = nn.LayerList() self.pred_angle = nn.LayerList() for in_c in self.in_channels: self.pred_cls.append( nn.Conv2D( in_c, self.num_classes, 3, padding=1)) self.pred_reg.append(nn.Conv2D(in_c, 4, 3, padding=1)) self.pred_angle.append( nn.Conv2D( in_c, self.angle_max + 1, 3, padding=1)) self.angle_proj_conv = nn.Conv2D( self.angle_max + 1, 1, 1, bias_attr=False) self._init_weights() @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) bias_angle = [10.] + [1.] 
* self.angle_max for cls_, reg_, angle_ in zip(self.pred_cls, self.pred_reg, self.pred_angle): normal_(cls_.weight, std=0.01) constant_(cls_.bias, bias_cls) normal_(reg_.weight, std=0.01) constant_(reg_.bias) constant_(angle_.weight) vector_(angle_.bias, bias_angle) angle_proj = paddle.linspace(0, self.angle_max, self.angle_max + 1) self.angle_proj = angle_proj * self.half_pi_bin self.angle_proj_conv.weight.set_value( self.angle_proj.reshape([1, self.angle_max + 1, 1, 1])) self.angle_proj_conv.weight.stop_gradient = True def _generate_anchors(self, feats): if self.trt: anchor_points = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape anchor, _ = anchor_generator( feat, stride * 4, 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], offset=0.5) x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) xc = (x1 + x2 + 1) / 2 yc = (y1 + y2 + 1) / 2 anchor_point = paddle.concat( [xc, yc], axis=-1).reshape((1, h * w, 2)) anchor_points.append(anchor_point) anchor_points = paddle.concat(anchor_points, axis=1) return anchor_points, None, None else: anchor_points = [] stride_tensor = [] num_anchors_list = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape shift_x = (paddle.arange(end=w) + 0.5) * stride shift_y = (paddle.arange(end=h) + 0.5) * stride shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([1, -1, 2])) stride_tensor.append( paddle.full( [1, h * w, 1], stride, dtype='float32')) num_anchors_list.append(h * w) anchor_points = paddle.concat(anchor_points, axis=1) stride_tensor = paddle.concat(stride_tensor, axis=1) return anchor_points, stride_tensor, num_anchors_list def forward(self, feats, targets=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" if self.training: return self.forward_train(feats, targets) else: return self.forward_eval(feats) def forward_train(self, feats, targets): anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( feats) cls_score_list, reg_dist_list, reg_angle_list = [], [], [] for i, feat in enumerate(feats): avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_dist_list.append(reg_dist.flatten(2).transpose([0, 2, 1])) reg_angle_list.append(reg_angle.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) reg_dist_list = paddle.concat(reg_dist_list, axis=1) reg_angle_list = paddle.concat(reg_angle_list, axis=1) return self.get_loss([ cls_score_list, reg_dist_list, reg_angle_list, anchor_points, num_anchors_list, stride_tensor ], targets) def forward_eval(self, feats): cls_score_list, reg_box_list = [], [] anchor_points, _, _ = self._generate_anchors(feats) for i, (feat, stride) in enumerate(zip(feats, self.fpn_strides)): b, _, h, w = feat.shape l = h * w # cls avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) # reg reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) reg_xy, reg_wh = paddle.split(reg_dist, 2, axis=1) reg_xy = reg_xy * stride reg_wh = (F.elu(reg_wh) + 1.) 
* stride reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) reg_angle = self.angle_proj_conv(F.softmax(reg_angle, axis=1)) reg_box = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.reshape([b, self.num_classes, l])) reg_box_list.append(reg_box.reshape([b, 5, l])) cls_score_list = paddle.concat(cls_score_list, axis=-1) reg_box_list = paddle.concat(reg_box_list, axis=-1).transpose([0, 2, 1]) reg_xy, reg_wha = paddle.split(reg_box_list, [2, 3], axis=-1) reg_xy = reg_xy + anchor_points reg_box_list = paddle.concat([reg_xy, reg_wha], axis=-1) return cls_score_list, reg_box_list def _bbox_decode(self, points, pred_dist, pred_angle, stride_tensor): # predict vector to x, y, w, h, angle b, l = pred_angle.shape[:2] xy, wh = paddle.split(pred_dist, 2, axis=-1) xy = xy * stride_tensor + points wh = (F.elu(wh) + 1.) * stride_tensor angle = F.softmax(pred_angle.reshape([b, l, 1, self.angle_max + 1 ])).matmul(self.angle_proj) return paddle.concat([xy, wh, angle], axis=-1) def get_loss(self, head_outs, gt_meta): pred_scores, pred_dist, pred_angle, \ anchor_points, num_anchors_list, stride_tensor = head_outs # [B, N, 5] -> [B, N, 5] pred_bboxes = self._bbox_decode(anchor_points, pred_dist, pred_angle, stride_tensor) gt_labels = gt_meta['gt_class'] # [B, N, 5] gt_bboxes = gt_meta['gt_rbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = \ self.static_assigner( anchor_points, stride_tensor, num_anchors_list, gt_labels, gt_meta['gt_bbox'], gt_bboxes, pad_gt_mask, self.num_classes, pred_bboxes.detach() ) else: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach(), anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # cls loss if self.use_varifocal_loss: one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label) else: loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) assigned_scores_sum = assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(assigned_scores_sum) assigned_scores_sum = paddle.clip( assigned_scores_sum / paddle.distributed.get_world_size(), min=1.) else: assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) 
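        # With varifocal loss the soft, IoU-aware `assigned_scores` are the
        # classification target; `alpha_l = -1` above disables the alpha
        # re-weighting branch inside `_focal_loss` when it is used instead.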
loss_cls /= assigned_scores_sum loss_iou, loss_dfl = self._bbox_loss(pred_angle, pred_bboxes, anchor_points, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum, stride_tensor) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl } return out_dict @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy( score, label, weight=weight, reduction='sum') return loss @staticmethod def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy( pred_score, gt_score, weight=weight, reduction='sum') return loss @staticmethod def _df_loss(pred_dist, target): target_left = paddle.cast(target, 'int64') target_right = target_left + 1 weight_left = target_right.astype('float32') - target weight_right = 1 - weight_left loss_left = F.cross_entropy( pred_dist, target_left, reduction='none') * weight_left loss_right = F.cross_entropy( pred_dist, target_right, reduction='none') * weight_right return (loss_left + loss_right).mean(-1, keepdim=True) def _bbox_loss(self, pred_angle, pred_bboxes, anchor_points, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum, stride_tensor): # select positive samples mask mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.sum() # pos/neg loss if num_pos > 0: # iou bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 5]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 5]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).reshape([-1]) loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / assigned_scores_sum # dfl angle_mask = mask_positive.unsqueeze(-1).tile( [1, 1, self.angle_max + 1]) pred_angle_pos = paddle.masked_select( pred_angle, angle_mask).reshape([-1, self.angle_max + 1]) assigned_angle_pos = ( assigned_bboxes_pos[:, 4] / self.half_pi_bin).clip(0, self.angle_max - 0.01) loss_dfl = self._df_loss(pred_angle_pos, assigned_angle_pos) else: loss_iou = pred_bboxes.sum() * 0. 
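            # Summing pred_bboxes and multiplying by zero (rather than using a
            # bare constant) keeps the predictions in the autograd graph, so a
            # well-defined zero gradient still flows when there are no positives.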
loss_dfl = paddle.zeros([1]) return loss_iou, loss_dfl def _box2corners(self, pred_bboxes): """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) Args: pred_bboxes (Tensor): [B, N, 5] Returns: polys (Tensor): [B, N, 8] """ x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) cos_a_half = paddle.cos(angle) * 0.5 sin_a_half = paddle.sin(angle) * 0.5 w_x = cos_a_half * w w_y = sin_a_half * w h_x = -sin_a_half * h h_y = cos_a_half * h return paddle.concat( [ x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y ], axis=-1) def post_process(self, head_outs, scale_factor): pred_scores, pred_bboxes = head_outs # [B, N, 5] -> [B, N, 8] pred_bboxes = self._box2corners(pred_bboxes) # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [ scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, scale_y ], axis=-1).reshape([-1, 1, 8]) pred_bboxes /= scale_factor if self.export_onnx: return pred_bboxes, pred_scores, None bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num, nms_keep_idx ================================================ FILE: ppdet/modeling/heads/retina_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.modeling.bbox_utils import bbox2delta, delta2bbox from ppdet.modeling.heads.fcos_head import FCOSFeat from ppdet.core.workspace import register __all__ = ['RetinaHead'] @register class RetinaFeat(FCOSFeat): """We use FCOSFeat to construct conv layers in RetinaNet. We rename FCOSFeat to RetinaFeat to avoid confusion. 
""" pass @register class RetinaHead(nn.Layer): """Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf """ __shared__ = ['num_classes'] __inject__ = [ 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', 'loss_bbox', 'nms' ] def __init__(self, num_classes=80, conv_feat='RetinaFeat', anchor_generator='RetinaAnchorGenerator', bbox_assigner='MaxIoUAssigner', loss_class='FocalLoss', loss_bbox='SmoothL1Loss', nms='MultiClassNMS', prior_prob=0.01, nms_pre=1000, weights=[1., 1., 1., 1.]): super(RetinaHead, self).__init__() self.num_classes = num_classes self.conv_feat = conv_feat self.anchor_generator = anchor_generator self.bbox_assigner = bbox_assigner self.loss_class = loss_class self.loss_bbox = loss_bbox self.nms = nms self.nms_pre = nms_pre self.weights = weights bias_init_value = -math.log((1 - prior_prob) / prior_prob) num_anchors = self.anchor_generator.num_anchors self.retina_cls = nn.Conv2D( in_channels=self.conv_feat.feat_out, out_channels=self.num_classes * num_anchors, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=bias_init_value))) self.retina_reg = nn.Conv2D( in_channels=self.conv_feat.feat_out, out_channels=4 * num_anchors, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0))) def forward(self, neck_feats, targets=None): cls_logits_list = [] bboxes_reg_list = [] for neck_feat in neck_feats: conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat) cls_logits = self.retina_cls(conv_cls_feat) bbox_reg = self.retina_reg(conv_reg_feat) cls_logits_list.append(cls_logits) bboxes_reg_list.append(bbox_reg) if self.training: return self.get_loss([cls_logits_list, bboxes_reg_list], targets) else: return [cls_logits_list, bboxes_reg_list] def get_loss(self, head_outputs, targets): """Here we calculate loss for a batch of images. We assign anchors to gts in each image and gather all the assigned postive and negative samples. Then loss is calculated on the gathered samples. 
""" cls_logits_list, bboxes_reg_list = head_outputs anchors = self.anchor_generator(cls_logits_list) anchors = paddle.concat(anchors) # matches: contain gt_inds # match_labels: -1(ignore), 0(neg) or 1(pos) matches_list, match_labels_list = [], [] # assign anchors to gts, no sampling is involved for gt_bbox in targets['gt_bbox']: matches, match_labels = self.bbox_assigner(anchors, gt_bbox) matches_list.append(matches) match_labels_list.append(match_labels) # reshape network outputs cls_logits = [ _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes]) for _ in cls_logits_list ] bboxes_reg = [ _.transpose([0, 2, 3, 1]).reshape([0, -1, 4]) for _ in bboxes_reg_list ] cls_logits = paddle.concat(cls_logits, axis=1) bboxes_reg = paddle.concat(bboxes_reg, axis=1) cls_pred_list, cls_tar_list = [], [] reg_pred_list, reg_tar_list = [], [] # find and gather preds and targets in each image for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \ zip(matches_list, match_labels_list, cls_logits, bboxes_reg, targets['gt_bbox'], targets['gt_class']): pos_mask = (match_labels == 1) neg_mask = (match_labels == 0) chosen_mask = paddle.logical_or(pos_mask, neg_mask) gt_class = gt_class.reshape([-1]) bg_class = paddle.to_tensor( [self.num_classes], dtype=gt_class.dtype) # a trick to assign num_classes to negative targets gt_class = paddle.concat([gt_class, bg_class], axis=-1) matches = paddle.where(neg_mask, paddle.full_like(matches, gt_class.size - 1), matches) cls_pred = cls_logit[chosen_mask] cls_tar = gt_class[matches[chosen_mask]] reg_pred = bbox_reg[pos_mask].reshape([-1, 4]) reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4]) reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights) cls_pred_list.append(cls_pred) cls_tar_list.append(cls_tar) reg_pred_list.append(reg_pred) reg_tar_list.append(reg_tar) cls_pred = paddle.concat(cls_pred_list) cls_tar = paddle.concat(cls_tar_list) reg_pred = paddle.concat(reg_pred_list) reg_tar = paddle.concat(reg_tar_list) avg_factor = max(1.0, reg_pred.shape[0]) cls_loss = self.loss_class( cls_pred, cls_tar, reduction='sum') / avg_factor if reg_pred.shape[0] == 0: reg_loss = paddle.zeros([]) reg_loss.stop_gradient = False else: reg_loss = self.loss_bbox( reg_pred, reg_tar, reduction='sum') / avg_factor loss = cls_loss + reg_loss out_dict = { 'loss_cls': cls_loss, 'loss_reg': reg_loss, 'loss': loss, } return out_dict def get_bboxes_single(self, anchors, cls_scores_list, bbox_preds_list, im_shape, scale_factor, rescale=True): assert len(cls_scores_list) == len(bbox_preds_list) mlvl_bboxes = [] mlvl_scores = [] for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list, bbox_preds_list): cls_score = cls_score.reshape([-1, self.num_classes]) bbox_pred = bbox_pred.reshape([-1, 4]) if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: max_score = cls_score.max(axis=1) _, topk_inds = max_score.topk(self.nms_pre) bbox_pred = bbox_pred.gather(topk_inds) anchor = anchor.gather(topk_inds) cls_score = cls_score.gather(topk_inds) bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze() mlvl_bboxes.append(bbox_pred) mlvl_scores.append(F.sigmoid(cls_score)) mlvl_bboxes = paddle.concat(mlvl_bboxes) mlvl_bboxes = paddle.squeeze(mlvl_bboxes) if rescale: mlvl_bboxes = mlvl_bboxes / paddle.concat( [scale_factor[::-1], scale_factor[::-1]]) mlvl_scores = paddle.concat(mlvl_scores) mlvl_scores = mlvl_scores.transpose([1, 0]) return mlvl_bboxes, mlvl_scores def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor): batch_bboxes 
= [] batch_scores = [] for img_id in range(cls_logits[0].shape[0]): num_lvls = len(cls_logits) cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)] bboxes, scores = self.get_bboxes_single( anchors, cls_scores_list, bbox_preds_list, im_shape[img_id], scale_factor[img_id]) batch_bboxes.append(bboxes) batch_scores.append(scores) batch_bboxes = paddle.stack(batch_bboxes, axis=0) batch_scores = paddle.stack(batch_scores, axis=0) return batch_bboxes, batch_scores def post_process(self, head_outputs, im_shape, scale_factor): cls_logits_list, bboxes_reg_list = head_outputs anchors = self.anchor_generator(cls_logits_list) cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list] bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape, scale_factor) bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, scores) return bbox_pred, bbox_num, nms_keep_idx def get_scores_single(self, cls_scores_list): mlvl_logits = [] for cls_score in cls_scores_list: cls_score = cls_score.reshape([-1, self.num_classes]) if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: max_score = cls_score.max(axis=1) _, topk_inds = max_score.topk(self.nms_pre) cls_score = cls_score.gather(topk_inds) mlvl_logits.append(cls_score) mlvl_logits = paddle.concat(mlvl_logits) mlvl_logits = mlvl_logits.transpose([1, 0]) return mlvl_logits def decode_cls_logits(self, cls_logits_list): cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] batch_logits = [] for img_id in range(cls_logits[0].shape[0]): num_lvls = len(cls_logits) cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] logits = self.get_scores_single(cls_scores_list) batch_logits.append(logits) batch_logits = paddle.stack(batch_logits, axis=0) return batch_logits ================================================ FILE: ppdet/modeling/heads/roi_extractor.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from ppdet.core.workspace import register from ppdet.modeling import ops import paddle.nn as nn def _to_list(v): if not isinstance(v, (list, tuple)): return [v] return v @register class RoIAlign(nn.Layer): """ RoI Align module For more details, please refer to the document of roi_align in in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/ops.py Args: resolution (int): The output size, default 14 spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. default 0.0625 sampling_ratio (int): The number of sampling points in the interpolation grid, default 0 canconical_level (int): The referring level of FPN layer with specified level. default 4 canonical_size (int): The referring scale of FPN layer with specified scale. 
default 224 start_level (int): The start level of FPN layer to extract RoI feature, default 0 end_level (int): The end level of FPN layer to extract RoI feature, default 3 aligned (bool): Whether to add offset to rois' coord in roi_align. default false """ def __init__(self, resolution=14, spatial_scale=0.0625, sampling_ratio=0, canconical_level=4, canonical_size=224, start_level=0, end_level=3, aligned=False): super(RoIAlign, self).__init__() self.resolution = resolution self.spatial_scale = _to_list(spatial_scale) self.sampling_ratio = sampling_ratio self.canconical_level = canconical_level self.canonical_size = canonical_size self.start_level = start_level self.end_level = end_level self.aligned = False # TODO: npu kernel do not support aligned=True @classmethod def from_config(cls, cfg, input_shape): return {'spatial_scale': [1. / i.stride for i in input_shape]} def forward(self, feats, roi, rois_num): roi = paddle.concat(roi) if len(roi) > 1 else roi[0] if len(feats) == 1: rois_feat = paddle.vision.ops.roi_align( x=feats[self.start_level], boxes=roi, boxes_num=rois_num, output_size=self.resolution, spatial_scale=self.spatial_scale[0], aligned=self.aligned) else: offset = 2 k_min = self.start_level + offset k_max = self.end_level + offset if hasattr(paddle.vision.ops, "distribute_fpn_proposals"): distribute_fpn_proposals = getattr(paddle.vision.ops, "distribute_fpn_proposals") else: distribute_fpn_proposals = ops.distribute_fpn_proposals rois_dist, restore_index, rois_num_dist = distribute_fpn_proposals( roi, k_min, k_max, self.canconical_level, self.canonical_size, rois_num=rois_num) rois_feat_list = [] for lvl in range(self.start_level, self.end_level + 1): roi_feat = paddle.vision.ops.roi_align( x=feats[lvl], boxes=rois_dist[lvl], boxes_num=rois_num_dist[lvl], output_size=self.resolution, spatial_scale=self.spatial_scale[lvl], sampling_ratio=self.sampling_ratio, aligned=self.aligned) rois_feat_list.append(roi_feat) rois_feat_shuffle = paddle.concat(rois_feat_list) rois_feat = paddle.gather(rois_feat_shuffle, restore_index) return rois_feat ================================================ FILE: ppdet/modeling/heads/s2anet_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
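For reference, the level-routing rule behind `distribute_fpn_proposals`, which the `RoIAlign` module in roi_extractor.py above relies on, can be sketched with the standard FPN assignment formula (a stdlib-only illustration; the helper name `fpn_level` is ours, not the library's):

import math

def fpn_level(w, h, k_min=2, k_max=5, refer_level=4, refer_scale=224):
    # Larger RoIs are pooled from coarser FPN levels:
    #   level = floor(refer_level + log2(sqrt(w * h) / refer_scale))
    level = math.floor(refer_level + math.log2(math.sqrt(w * h) / refer_scale))
    return min(max(level, k_min), k_max)

print(fpn_level(224, 224))  # -> 4: the canonical level at the canonical size
print(fpn_level(56, 56))    # -> 2: a small RoI routed to the finest level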
# # The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/models/anchor_heads_rotated/s2anet_head.py import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.proposal_generator.target_layer import RBoxAssigner from ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator from ppdet.modeling.layers import AlignConv from ..cls_utils import _get_class_default_kwargs import numpy as np @register class S2ANetHead(nn.Layer): """ S2Anet head Args: stacked_convs (int): number of stacked_convs feat_in (int): input channels of feat feat_out (int): output channels of feat num_classes (int): num_classes anchor_strides (list): stride of anchors anchor_scales (list): scale of anchors anchor_ratios (list): ratios of anchors target_means (list): target_means target_stds (list): target_stds align_conv_type (str): align_conv_type ['Conv', 'AlignConv'] align_conv_size (int): kernel size of align_conv use_sigmoid_cls (bool): use sigmoid_cls or not reg_loss_weight (list): loss weight for regression """ __shared__ = ['num_classes'] __inject__ = ['anchor_assign', 'nms'] def __init__(self, stacked_convs=2, feat_in=256, feat_out=256, num_classes=15, anchor_strides=[8, 16, 32, 64, 128], anchor_scales=[4], anchor_ratios=[1.0], target_means=0.0, target_stds=1.0, align_conv_type='AlignConv', align_conv_size=3, use_sigmoid_cls=True, anchor_assign=_get_class_default_kwargs(RBoxAssigner), reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1], cls_loss_weight=[1.1, 1.05], reg_loss_type='l1', nms_pre=2000, nms='MultiClassNMS'): super(S2ANetHead, self).__init__() self.stacked_convs = stacked_convs self.feat_in = feat_in self.feat_out = feat_out self.anchor_list = None self.anchor_scales = anchor_scales self.anchor_ratios = anchor_ratios self.anchor_strides = anchor_strides self.anchor_strides = paddle.to_tensor(anchor_strides) self.anchor_base_sizes = list(anchor_strides) self.means = paddle.ones(shape=[5]) * target_means self.stds = paddle.ones(shape=[5]) * target_stds assert align_conv_type in ['AlignConv', 'Conv', 'DCN'] self.align_conv_type = align_conv_type self.align_conv_size = align_conv_size self.use_sigmoid_cls = use_sigmoid_cls self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1 self.sampling = False self.anchor_assign = anchor_assign self.reg_loss_weight = reg_loss_weight self.cls_loss_weight = cls_loss_weight self.alpha = 1.0 self.beta = 1.0 self.reg_loss_type = reg_loss_type self.nms_pre = nms_pre self.nms = nms self.fake_bbox = paddle.to_tensor( np.array( [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) # anchor self.anchor_generators = [] for anchor_base in self.anchor_base_sizes: self.anchor_generators.append( S2ANetAnchorGenerator(anchor_base, anchor_scales, anchor_ratios)) self.anchor_generators = nn.LayerList(self.anchor_generators) self.fam_cls_convs = nn.Sequential() self.fam_reg_convs = nn.Sequential() for i in range(self.stacked_convs): chan_in = self.feat_in if i == 0 else self.feat_out self.fam_cls_convs.add_sublayer( 'fam_cls_conv_{}'.format(i), nn.Conv2D( in_channels=chan_in, out_channels=self.feat_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i), 
nn.ReLU()) self.fam_reg_convs.add_sublayer( 'fam_reg_conv_{}'.format(i), nn.Conv2D( in_channels=chan_in, out_channels=self.feat_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i), nn.ReLU()) self.fam_reg = nn.Conv2D( self.feat_out, 5, 1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) prior_prob = 0.01 bias_init = float(-np.log((1 - prior_prob) / prior_prob)) self.fam_cls = nn.Conv2D( self.feat_out, self.cls_out_channels, 1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(bias_init))) if self.align_conv_type == "AlignConv": self.align_conv = AlignConv(self.feat_out, self.feat_out, self.align_conv_size) elif self.align_conv_type == "Conv": self.align_conv = nn.Conv2D( self.feat_out, self.feat_out, self.align_conv_size, padding=(self.align_conv_size - 1) // 2, bias_attr=ParamAttr(initializer=Constant(0))) elif self.align_conv_type == "DCN": self.align_conv_offset = nn.Conv2D( self.feat_out, 2 * self.align_conv_size**2, 1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) self.align_conv = paddle.vision.ops.DeformConv2D( self.feat_out, self.feat_out, self.align_conv_size, padding=(self.align_conv_size - 1) // 2, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=False) self.or_conv = nn.Conv2D( self.feat_out, self.feat_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) # ODM self.odm_cls_convs = nn.Sequential() self.odm_reg_convs = nn.Sequential() for i in range(self.stacked_convs): ch_in = self.feat_out # ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out self.odm_cls_convs.add_sublayer( 'odm_cls_conv_{}'.format(i), nn.Conv2D( in_channels=ch_in, out_channels=self.feat_out, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i), nn.ReLU()) self.odm_reg_convs.add_sublayer( 'odm_reg_conv_{}'.format(i), nn.Conv2D( in_channels=self.feat_out, out_channels=self.feat_out, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i), nn.ReLU()) self.odm_cls = nn.Conv2D( self.feat_out, self.cls_out_channels, 3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(bias_init))) self.odm_reg = nn.Conv2D( self.feat_out, 5, 3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) def forward(self, feats, targets=None): fam_reg_list, fam_cls_list = [], [] odm_reg_list, odm_cls_list = [], [] num_anchors_list, base_anchors_list, refine_anchors_list = [], [], [] for i, feat in enumerate(feats): # get shape B = feat.shape[0] H, W = feat.shape[2], feat.shape[3] NA = H * W num_anchors_list.append(NA) fam_cls_feat = self.fam_cls_convs(feat) fam_cls = self.fam_cls(fam_cls_feat) # [N, CLS, H, W] --> [N, H, W, CLS] fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape( [B, NA, self.cls_out_channels]) fam_cls_list.append(fam_cls) fam_reg_feat = self.fam_reg_convs(feat) fam_reg = self.fam_reg(fam_reg_feat) # [N, 5, H, W] --> [N, H, W, 5] fam_reg 
= fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) fam_reg_list.append(fam_reg) # prepare anchor init_anchors = self.anchor_generators[i]((H, W), self.anchor_strides[i]) init_anchors = init_anchors.reshape([1, NA, 5]) base_anchors_list.append(init_anchors.squeeze(0)) if self.training: refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors) else: refine_anchor = self.bbox_decode(fam_reg, init_anchors) refine_anchors_list.append(refine_anchor) if self.align_conv_type == 'AlignConv': align_feat = self.align_conv(feat, refine_anchor.clone(), (H, W), self.anchor_strides[i]) elif self.align_conv_type == 'DCN': align_offset = self.align_conv_offset(feat) align_feat = self.align_conv(feat, align_offset) elif self.align_conv_type == 'Conv': align_feat = self.align_conv(feat) or_feat = self.or_conv(align_feat) odm_reg_feat = or_feat odm_cls_feat = or_feat odm_reg_feat = self.odm_reg_convs(odm_reg_feat) odm_cls_feat = self.odm_cls_convs(odm_cls_feat) odm_cls = self.odm_cls(odm_cls_feat) # [N, CLS, H, W] --> [N, H, W, CLS] odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape( [B, NA, self.cls_out_channels]) odm_cls_list.append(odm_cls) odm_reg = self.odm_reg(odm_reg_feat) # [N, 5, H, W] --> [N, H, W, 5] odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) odm_reg_list.append(odm_reg) if self.training: return self.get_loss([ fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, num_anchors_list, base_anchors_list, refine_anchors_list ], targets) else: odm_bboxes_list = [] for odm_reg, refine_anchor in zip(odm_reg_list, refine_anchors_list): odm_bboxes = self.bbox_decode(odm_reg, refine_anchor) odm_bboxes_list.append(odm_bboxes) return [odm_bboxes_list, odm_cls_list] def get_bboxes(self, head_outs): perd_bboxes_list, pred_scores_list = head_outs batch = pred_scores_list[0].shape[0] bboxes, bbox_num = [], [] for i in range(batch): pred_scores_per_image = [t[i] for t in pred_scores_list] pred_bboxes_per_image = [t[i] for t in perd_bboxes_list] bbox_per_image, bbox_num_per_image = self.get_bboxes_single( pred_scores_per_image, pred_bboxes_per_image) bboxes.append(bbox_per_image) bbox_num.append(bbox_num_per_image) bboxes = paddle.concat(bboxes) bbox_num = paddle.concat(bbox_num) return bboxes, bbox_num def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): """ Rescale, clip and filter the bbox from the output of NMS to get final prediction. Args: bboxes(Tensor): bboxes [N, 10] bbox_num(Tensor): bbox_num im_shape(Tensor): [1 2] scale_factor(Tensor): [1 2] Returns: bbox_pred(Tensor): The output is the prediction with shape [N, 8] including labels, scores and bboxes. The size of bboxes are corresponding to the original image. 
""" origin_shape = paddle.floor(im_shape / scale_factor + 0.5) origin_shape_list = [] scale_factor_list = [] # scale_factor: scale_y, scale_x for i in range(bbox_num.shape[0]): expand_shape = paddle.expand(origin_shape[i:i + 1, :], [bbox_num[i], 2]) scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2] scale = paddle.concat([ scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, scale_y ]) expand_scale = paddle.expand(scale, [bbox_num[i], 8]) origin_shape_list.append(expand_shape) scale_factor_list.append(expand_scale) origin_shape_list = paddle.concat(origin_shape_list) scale_factor_list = paddle.concat(scale_factor_list) # bboxes: [N, 10], label, score, bbox pred_label_score = bboxes[:, 0:2] pred_bbox = bboxes[:, 2:] # rescale bbox to original image pred_bbox = pred_bbox.reshape([-1, 8]) scaled_bbox = pred_bbox / scale_factor_list origin_h = origin_shape_list[:, 0] origin_w = origin_shape_list[:, 1] bboxes = scaled_bbox zeros = paddle.zeros_like(origin_h) x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) return pred_result def get_bboxes_single(self, cls_score_list, bbox_pred_list): mlvl_bboxes = [] mlvl_scores = [] for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list): if self.use_sigmoid_cls: scores = F.sigmoid(cls_score) else: scores = F.softmax(cls_score, axis=-1) if scores.shape[0] > self.nms_pre: # Get maximum scores for foreground classes. 
if self.use_sigmoid_cls: max_scores = paddle.max(scores, axis=1) else: max_scores = paddle.max(scores[:, :-1], axis=1) topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre) bbox_pred = paddle.gather(bbox_pred, topk_inds) scores = paddle.gather(scores, topk_inds) mlvl_bboxes.append(bbox_pred) mlvl_scores.append(scores) mlvl_bboxes = paddle.concat(mlvl_bboxes) mlvl_scores = paddle.concat(mlvl_scores) mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0) mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0) bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores) if bbox.shape[0] <= 0: bbox = self.fake_bbox bbox_num = self.fake_bbox_num return bbox, bbox_num def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0): """ Args: pred: pred score label: label delta: delta Returns: loss """ assert pred.shape == label.shape and label.numel() > 0 assert delta > 0 diff = paddle.abs(pred - label) loss = paddle.where(diff < delta, 0.5 * diff * diff / delta, diff - 0.5 * delta) return loss def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'): (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) = fam_target fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out fam_cls_losses = [] fam_bbox_losses = [] st_idx = 0 num_total_samples = len(pos_inds) + len( neg_inds) if self.sampling else len(pos_inds) num_total_samples = max(1, num_total_samples) for idx, feat_anchor_num in enumerate(num_anchors_list): # step1: get data feat_labels = labels[st_idx:st_idx + feat_anchor_num] feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] # step2: calc cls loss feat_labels = feat_labels.reshape(-1) feat_label_weights = feat_label_weights.reshape(-1) fam_cls_score = fam_cls_branch_list[idx] fam_cls_score = paddle.squeeze(fam_cls_score, axis=0) fam_cls_score1 = fam_cls_score feat_labels = paddle.to_tensor(feat_labels) feat_labels_one_hot = paddle.nn.functional.one_hot( feat_labels, self.cls_out_channels + 1) feat_labels_one_hot = feat_labels_one_hot[:, 1:] feat_labels_one_hot.stop_gradient = True num_total_samples = paddle.to_tensor( num_total_samples, dtype='float32', stop_gradient=True) fam_cls = F.sigmoid_focal_loss( fam_cls_score1, feat_labels_one_hot, normalizer=num_total_samples, reduction='none') feat_label_weights = feat_label_weights.reshape( feat_label_weights.shape[0], 1) feat_label_weights = np.repeat( feat_label_weights, self.cls_out_channels, axis=1) feat_label_weights = paddle.to_tensor( feat_label_weights, stop_gradient=True) fam_cls = fam_cls * feat_label_weights fam_cls_total = paddle.sum(fam_cls) fam_cls_losses.append(fam_cls_total) # step3: regression loss feat_bbox_targets = paddle.to_tensor( feat_bbox_targets, dtype='float32', stop_gradient=True) feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) fam_bbox_pred = fam_reg_branch_list[idx] fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0) fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5]) fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets) loss_weight = paddle.to_tensor( self.reg_loss_weight, dtype='float32', stop_gradient=True) fam_bbox = paddle.multiply(fam_bbox, loss_weight) feat_bbox_weights = paddle.to_tensor( feat_bbox_weights, stop_gradient=True) fam_bbox = fam_bbox * feat_bbox_weights fam_bbox_total = paddle.sum(fam_bbox) / 
num_total_samples fam_bbox_losses.append(fam_bbox_total) st_idx += feat_anchor_num fam_cls_loss = paddle.add_n(fam_cls_losses) fam_cls_loss_weight = paddle.to_tensor( self.cls_loss_weight[0], dtype='float32', stop_gradient=True) fam_cls_loss = fam_cls_loss * fam_cls_loss_weight fam_reg_loss = paddle.add_n(fam_bbox_losses) return fam_cls_loss, fam_reg_loss def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'): (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) = odm_target fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out odm_cls_losses = [] odm_bbox_losses = [] st_idx = 0 num_total_samples = len(pos_inds) + len( neg_inds) if self.sampling else len(pos_inds) num_total_samples = max(1, num_total_samples) for idx, feat_anchor_num in enumerate(num_anchors_list): # step1: get data feat_labels = labels[st_idx:st_idx + feat_anchor_num] feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] # step2: calc cls loss feat_labels = feat_labels.reshape(-1) feat_label_weights = feat_label_weights.reshape(-1) odm_cls_score = odm_cls_branch_list[idx] odm_cls_score = paddle.squeeze(odm_cls_score, axis=0) odm_cls_score1 = odm_cls_score feat_labels = paddle.to_tensor(feat_labels) feat_labels_one_hot = paddle.nn.functional.one_hot( feat_labels, self.cls_out_channels + 1) feat_labels_one_hot = feat_labels_one_hot[:, 1:] feat_labels_one_hot.stop_gradient = True num_total_samples = paddle.to_tensor( num_total_samples, dtype='float32', stop_gradient=True) odm_cls = F.sigmoid_focal_loss( odm_cls_score1, feat_labels_one_hot, normalizer=num_total_samples, reduction='none') feat_label_weights = feat_label_weights.reshape( feat_label_weights.shape[0], 1) feat_label_weights = np.repeat( feat_label_weights, self.cls_out_channels, axis=1) feat_label_weights = paddle.to_tensor(feat_label_weights) feat_label_weights.stop_gradient = True odm_cls = odm_cls * feat_label_weights odm_cls_total = paddle.sum(odm_cls) odm_cls_losses.append(odm_cls_total) # # step3: regression loss feat_bbox_targets = paddle.to_tensor( feat_bbox_targets, dtype='float32') feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) feat_bbox_targets.stop_gradient = True odm_bbox_pred = odm_reg_branch_list[idx] odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0) odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5]) odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets) loss_weight = paddle.to_tensor( self.reg_loss_weight, dtype='float32', stop_gradient=True) odm_bbox = paddle.multiply(odm_bbox, loss_weight) feat_bbox_weights = paddle.to_tensor( feat_bbox_weights, stop_gradient=True) odm_bbox = odm_bbox * feat_bbox_weights odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples odm_bbox_losses.append(odm_bbox_total) st_idx += feat_anchor_num odm_cls_loss = paddle.add_n(odm_cls_losses) odm_cls_loss_weight = paddle.to_tensor( self.cls_loss_weight[1], dtype='float32', stop_gradient=True) odm_cls_loss = odm_cls_loss * odm_cls_loss_weight odm_reg_loss = paddle.add_n(odm_bbox_losses) return odm_cls_loss, odm_reg_loss def get_loss(self, head_outs, inputs): fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \ num_anchors_list, base_anchors_list, refine_anchors_list = head_outs # compute loss fam_cls_loss_lst = [] fam_reg_loss_lst = [] odm_cls_loss_lst = [] 
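# Illustrative sketch (not part of the original source): both the FAM and
# the ODM regression branches reuse `smooth_l1_loss` above, the standard
# piecewise penalty 0.5*d*d/delta for |d| < delta and |d| - 0.5*delta
# otherwise, i.e. quadratic near zero and linear in the tails. A quick
# numeric check of the two regimes with the default delta = 1/9
# (the `_demo_*` helper is hypothetical):
def _demo_smooth_l1(delta=1.0 / 9.0):
    import paddle
    diff = paddle.abs(paddle.to_tensor([0.05, 1.0]))  # |pred - label|
    loss = paddle.where(diff < delta, 0.5 * diff * diff / delta,
                        diff - 0.5 * delta)
    return loss  # -> [0.01125, 0.9444...]: quadratic vs. linear regime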
odm_reg_loss_lst = [] batch = len(inputs['gt_rbox']) for i in range(batch): # data_format: (xc, yc, w, h, theta) gt_mask = inputs['pad_gt_mask'][i, :, 0] gt_idx = paddle.nonzero(gt_mask).squeeze(-1) gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy() gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy() is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy() gt_labels = gt_labels + 1 anchors_per_image = np.concatenate(base_anchors_list) fam_cls_per_image = [t[i] for t in fam_cls_list] fam_reg_per_image = [t[i] for t in fam_reg_list] odm_cls_per_image = [t[i] for t in odm_cls_list] odm_reg_per_image = [t[i] for t in odm_reg_list] im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image, odm_cls_per_image, odm_reg_per_image, num_anchors_list) # FAM im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes, gt_labels, is_crowd) if im_fam_target is not None: im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss( im_fam_target, im_s2anet_head_out, self.reg_loss_type) fam_cls_loss_lst.append(im_fam_cls_loss) fam_reg_loss_lst.append(im_fam_reg_loss) # ODM refine_anchors_per_image = [t[i] for t in refine_anchors_list] refine_anchors_per_image = paddle.concat( refine_anchors_per_image).numpy() im_odm_target = self.anchor_assign(refine_anchors_per_image, gt_bboxes, gt_labels, is_crowd) if im_odm_target is not None: im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss( im_odm_target, im_s2anet_head_out, self.reg_loss_type) odm_cls_loss_lst.append(im_odm_cls_loss) odm_reg_loss_lst.append(im_odm_reg_loss) fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss return { 'loss': loss, 'fam_cls_loss': fam_cls_loss, 'fam_reg_loss': fam_reg_loss, 'odm_cls_loss': odm_cls_loss, 'odm_reg_loss': odm_reg_loss } def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6): """decode bbox from deltas Args: preds: [B, L, 5] anchors: [1, L, 5] return: bboxes: [B, L, 5] """ preds = paddle.add(paddle.multiply(preds, self.stds), self.means) dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1) max_ratio = np.abs(np.log(wh_ratio_clip)) dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split( anchors, 5, axis=-1) gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin( rroi_angle) + rroi_x gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos( rroi_angle) + rroi_y gw = rroi_w * dw.exp() gh = rroi_h * dh.exp() ga = np.pi * dangle + rroi_angle ga = (ga + np.pi / 4) % np.pi - np.pi / 4 bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1) return bboxes def rbox2poly(self, rboxes): """ rboxes: [x_ctr,y_ctr,w,h,angle] to polys: [x0,y0,x1,y1,x2,y2,x3,y3] """ N = rboxes.shape[0] x_ctr = rboxes[:, 0] y_ctr = rboxes[:, 1] width = rboxes[:, 2] height = rboxes[:, 3] angle = rboxes[:, 4] tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5 normal_rects = paddle.stack( [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0) normal_rects = paddle.reshape(normal_rects, [2, 4, N]) normal_rects = paddle.transpose(normal_rects, [2, 0, 1]) sin, cos = paddle.sin(angle), paddle.cos(angle) # M: [N,2,2] M = paddle.stack([cos, -sin, sin, cos], axis=0) M = paddle.reshape(M, [2, 2, N]) M = paddle.transpose(M, [2, 0, 1]) # polys: 
[N,8] polys = paddle.matmul(M, normal_rects) polys = paddle.transpose(polys, [2, 1, 0]) polys = paddle.reshape(polys, [-1, N]) polys = paddle.transpose(polys, [1, 0]) tmp = paddle.stack( [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1) polys = polys + tmp return polys ================================================ FILE: ppdet/modeling/heads/simota_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/yolox_head.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import math from functools import partial import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance from ppdet.data.transform.atss_assigner import bbox_overlaps from .gfl_head import GFLHead @register class OTAHead(GFLHead): """ OTAHead Args: conv_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_qfl (object): Instance of QualityFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. assigner (object): Instance of label assigner. reg_max: Max value of integral set :math: `{0, ..., reg_max}` n QFL setting. Default: 16. """ __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='QualityFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', assigner='SimOTAAssigner', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(OTAHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.conv_feat = conv_feat self.dgqp_module = dgqp_module self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_qfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.use_sigmoid = self.loss_qfl.use_sigmoid self.assigner = assigner def _get_target_single(self, flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_bboxes, gt_labels): """Compute targets for priors in a single image. 
""" pos_num, label, label_weight, bbox_target = self.assigner( F.sigmoid(flatten_cls_pred), flatten_center_and_stride, flatten_bbox, gt_bboxes, gt_labels) return (pos_num, label, label_weight, bbox_target) def get_loss(self, head_outs, gt_meta): cls_scores, bbox_preds = head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores ] num_imgs = gt_meta['im_id'].shape[0] featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] for featmap in cls_scores] decode_bbox_preds = [] center_and_strides = [] for featmap_size, stride, bbox_pred in zip(featmap_sizes, self.fpn_stride, bbox_preds): # center in origin image yy, xx = self.get_single_level_center_point(featmap_size, stride, self.cell_offset) center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile( [num_imgs, 1, 1]) center_and_strides.append(center_and_stride) center_in_feature = center_and_stride.reshape( [-1, 4])[:, :-2] / stride bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, 4 * (self.reg_max + 1)]) pred_distances = self.distribution_project(bbox_pred) decode_bbox_pred_wo_stride = distance2bbox( center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) flatten_cls_preds = [ cls_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, self.cls_out_channels]) for cls_pred in cls_scores ] flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class'] pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] for flatten_cls_pred,flatten_center_and_stride,flatten_bbox,gt_box, gt_label \ in zip(flatten_cls_preds.detach(),flatten_center_and_strides.detach(), \ flatten_bboxes.detach(),gt_boxes, gt_labels): pos_num, label, label_weight, bbox_target = self._get_target_single( flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_box, gt_label) pos_num_l.append(pos_num) label_l.append(label) label_weight_l.append(label_weight) bbox_target_l.append(bbox_target) labels = paddle.to_tensor(np.stack(label_l, axis=0)) label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) center_and_strides_list = self._images_to_levels( flatten_center_and_strides, num_level_anchors) labels_list = self._images_to_levels(labels, num_level_anchors) label_weights_list = self._images_to_levels(label_weights, num_level_anchors) bbox_targets_list = self._images_to_levels(bbox_targets, num_level_anchors) num_total_pos = sum(pos_num_l) try: paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1.) 
except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( cls_scores, bbox_preds, center_and_strides_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride): center_and_strides = center_and_strides.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) label_weights = label_weights.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) score = np.zeros(labels.shape) if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_centers = paddle.gather( center_and_strides[:, :-2], pos_inds, axis=0) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) score[pos_inds.numpy()] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') # qfl loss score = paddle.to_tensor(score) loss_qfl = self.loss_qfl( cls_score, (labels, score), weight=label_weights, avg_factor=num_total_pos) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) loss_qfl_list.append(loss_qfl) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) try: paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_qfl = sum(loss_qfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_states = dict( loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states @register class OTAVFLHead(OTAHead): __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='VarifocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', 
assigner='SimOTAAssigner', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(OTAVFLHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.conv_feat = conv_feat self.dgqp_module = dgqp_module self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_vfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.use_sigmoid = self.loss_vfl.use_sigmoid self.assigner = assigner def get_loss(self, head_outs, gt_meta): cls_scores, bbox_preds = head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores ] num_imgs = gt_meta['im_id'].shape[0] featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] for featmap in cls_scores] decode_bbox_preds = [] center_and_strides = [] for featmap_size, stride, bbox_pred in zip(featmap_sizes, self.fpn_stride, bbox_preds): # center in origin image yy, xx = self.get_single_level_center_point(featmap_size, stride, self.cell_offset) strides = paddle.full((len(xx), ), stride) center_and_stride = paddle.stack([xx, yy, strides, strides], -1).tile([num_imgs, 1, 1]) center_and_strides.append(center_and_stride) center_in_feature = center_and_stride.reshape( [-1, 4])[:, :-2] / stride bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, 4 * (self.reg_max + 1)]) pred_distances = self.distribution_project(bbox_pred) decode_bbox_pred_wo_stride = distance2bbox( center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) flatten_cls_preds = [ cls_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, self.cls_out_channels]) for cls_pred in cls_scores ] flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class'] pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] for flatten_cls_pred, flatten_center_and_stride, flatten_bbox,gt_box,gt_label \ in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(), \ flatten_bboxes.detach(),gt_boxes,gt_labels): pos_num, label, label_weight, bbox_target = self._get_target_single( flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_box, gt_label) pos_num_l.append(pos_num) label_l.append(label) label_weight_l.append(label_weight) bbox_target_l.append(bbox_target) labels = paddle.to_tensor(np.stack(label_l, axis=0)) label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) center_and_strides_list = self._images_to_levels( flatten_center_and_strides, num_level_anchors) labels_list = self._images_to_levels(labels, num_level_anchors) label_weights_list = self._images_to_levels(label_weights, num_level_anchors) bbox_targets_list = self._images_to_levels(bbox_targets, num_level_anchors) num_total_pos = sum(pos_num_l) try: paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1.) 
except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], [] for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( cls_scores, bbox_preds, center_and_strides_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride): center_and_strides = center_and_strides.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) # vfl vfl_score = np.zeros(cls_score.shape) if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_centers = paddle.gather( center_and_strides[:, :-2], pos_inds, axis=0) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) # vfl pos_labels = paddle.gather(labels, pos_inds, axis=0) vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') # vfl loss num_pos_avg_per_gpu = num_total_pos vfl_score = paddle.to_tensor(vfl_score) loss_vfl = self.loss_vfl( cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) loss_vfl_list.append(loss_vfl) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) try: paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_vfl = sum(loss_vfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_states = dict( loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states ================================================ FILE: ppdet/modeling/heads/solov2_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS, DropBlock from ppdet.core.workspace import register from six.moves import zip import numpy as np __all__ = ['SOLOv2Head'] @register class SOLOv2MaskHead(nn.Layer): """ MaskHead of SOLOv2. The code of this function is based on: https://github.com/WXinlong/SOLO/blob/master/mmdet/models/mask_heads/mask_feat_head.py Args: in_channels (int): The channel number of input Tensor. out_channels (int): The channel number of output Tensor. start_level (int): The position where the input starts. end_level (int): The position where the input ends. use_dcn_in_tower (bool): Whether to use dcn in tower or not. """ __shared__ = ['norm_type'] def __init__(self, in_channels=256, mid_channels=128, out_channels=256, start_level=0, end_level=3, use_dcn_in_tower=False, norm_type='gn'): super(SOLOv2MaskHead, self).__init__() assert start_level >= 0 and end_level >= start_level self.in_channels = in_channels self.out_channels = out_channels self.mid_channels = mid_channels self.use_dcn_in_tower = use_dcn_in_tower self.range_level = end_level - start_level + 1 self.use_dcn = True if self.use_dcn_in_tower else False self.convs_all_levels = [] self.norm_type = norm_type for i in range(start_level, end_level + 1): conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i) conv_pre_feat = nn.Sequential() if i == start_level: conv_pre_feat.add_sublayer( conv_feat_name + '.conv' + str(i), ConvNormLayer( ch_in=self.in_channels, ch_out=self.mid_channels, filter_size=3, stride=1, use_dcn=self.use_dcn, norm_type=self.norm_type)) self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) self.convs_all_levels.append(conv_pre_feat) else: for j in range(i): ch_in = 0 if j == 0: ch_in = self.in_channels + 2 if i == end_level else self.in_channels else: ch_in = self.mid_channels conv_pre_feat.add_sublayer( conv_feat_name + '.conv' + str(j), ConvNormLayer( ch_in=ch_in, ch_out=self.mid_channels, filter_size=3, stride=1, use_dcn=self.use_dcn, norm_type=self.norm_type)) conv_pre_feat.add_sublayer( conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU()) conv_pre_feat.add_sublayer( 'upsample' + str(i) + str(j), nn.Upsample( scale_factor=2, mode='bilinear')) self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) self.convs_all_levels.append(conv_pre_feat) conv_pred_name = 'mask_feat_head.conv_pred.0' self.conv_pred = self.add_sublayer( conv_pred_name, ConvNormLayer( ch_in=self.mid_channels, ch_out=self.out_channels, filter_size=1, stride=1, use_dcn=self.use_dcn, norm_type=self.norm_type)) def forward(self, inputs): """ Get SOLOv2MaskHead output. 
Args: inputs(list[Tensor]): feature map from each necks with shape of [N, C, H, W] Returns: ins_pred(Tensor): Output of SOLOv2MaskHead head """ feat_all_level = F.relu(self.convs_all_levels[0](inputs[0])) for i in range(1, self.range_level): input_p = inputs[i] if i == (self.range_level - 1): input_feat = input_p x_range = paddle.linspace( -1, 1, input_feat.shape[-1], dtype='float32') y_range = paddle.linspace( -1, 1, input_feat.shape[-2], dtype='float32') y, x = paddle.meshgrid([y_range, x_range]) x = paddle.unsqueeze(x, [0, 1]) y = paddle.unsqueeze(y, [0, 1]) y = paddle.expand( y, shape=[input_feat.shape[0], 1, -1, -1]) x = paddle.expand( x, shape=[input_feat.shape[0], 1, -1, -1]) coord_feat = paddle.concat([x, y], axis=1) input_p = paddle.concat([input_p, coord_feat], axis=1) feat_all_level = paddle.add(feat_all_level, self.convs_all_levels[i](input_p)) ins_pred = F.relu(self.conv_pred(feat_all_level)) return ins_pred @register class SOLOv2Head(nn.Layer): """ Head block for SOLOv2 network Args: num_classes (int): Number of output classes. in_channels (int): Number of input channels. seg_feat_channels (int): Num_filters of kernel & categroy branch convolution operation. stacked_convs (int): Times of convolution operation. num_grids (list[int]): List of feature map grids size. kernel_out_channels (int): Number of output channels in kernel branch. dcn_v2_stages (list): Which stage use dcn v2 in tower. It is between [0, stacked_convs). segm_strides (list[int]): List of segmentation area stride. solov2_loss (object): SOLOv2Loss instance. score_threshold (float): Threshold of categroy score. mask_nms (object): MaskMatrixNMS instance. """ __inject__ = ['solov2_loss', 'mask_nms'] __shared__ = ['norm_type', 'num_classes'] def __init__(self, num_classes=80, in_channels=256, seg_feat_channels=256, stacked_convs=4, num_grids=[40, 36, 24, 16, 12], kernel_out_channels=256, dcn_v2_stages=[], segm_strides=[8, 8, 16, 32, 32], solov2_loss=None, score_threshold=0.1, mask_threshold=0.5, mask_nms=None, norm_type='gn', drop_block=False): super(SOLOv2Head, self).__init__() self.num_classes = num_classes self.in_channels = in_channels self.seg_num_grids = num_grids self.cate_out_channels = self.num_classes self.seg_feat_channels = seg_feat_channels self.stacked_convs = stacked_convs self.kernel_out_channels = kernel_out_channels self.dcn_v2_stages = dcn_v2_stages self.segm_strides = segm_strides self.solov2_loss = solov2_loss self.mask_nms = mask_nms self.score_threshold = score_threshold self.mask_threshold = mask_threshold self.norm_type = norm_type self.drop_block = drop_block self.kernel_pred_convs = [] self.cate_pred_convs = [] for i in range(self.stacked_convs): use_dcn = True if i in self.dcn_v2_stages else False ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels kernel_conv = self.add_sublayer( 'bbox_head.kernel_convs.' + str(i), ConvNormLayer( ch_in=ch_in, ch_out=self.seg_feat_channels, filter_size=3, stride=1, use_dcn=use_dcn, norm_type=self.norm_type)) self.kernel_pred_convs.append(kernel_conv) ch_in = self.in_channels if i == 0 else self.seg_feat_channels cate_conv = self.add_sublayer( 'bbox_head.cate_convs.' 
+ str(i), ConvNormLayer( ch_in=ch_in, ch_out=self.seg_feat_channels, filter_size=3, stride=1, use_dcn=use_dcn, norm_type=self.norm_type)) self.cate_pred_convs.append(cate_conv) self.solo_kernel = self.add_sublayer( 'bbox_head.solo_kernel', nn.Conv2D( self.seg_feat_channels, self.kernel_out_channels, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=True)) self.solo_cate = self.add_sublayer( 'bbox_head.solo_cate', nn.Conv2D( self.seg_feat_channels, self.cate_out_channels, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant( value=float(-np.log((1 - 0.01) / 0.01)))))) if self.drop_block and self.training: self.drop_block_fun = DropBlock( block_size=3, keep_prob=0.9, name='solo_cate.dropblock') def _points_nms(self, heat, kernel_size=2): hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1) keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32') return heat * keep def _split_feats(self, feats): return (F.interpolate( feats[0], scale_factor=0.5, align_corners=False, align_mode=0, mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate( feats[4], size=feats[3].shape[-2:], mode='bilinear', align_corners=False, align_mode=0)) def forward(self, input): """ Get SOLOv2 head output Args: input (list): List of Tensors, output of backbone or neck stages Returns: cate_pred_list (list): Tensors of each category branch layer kernel_pred_list (list): Tensors of each kernel branch layer """ feats = self._split_feats(input) cate_pred_list = [] kernel_pred_list = [] for idx in range(len(self.seg_num_grids)): cate_pred, kernel_pred = self._get_output_single(feats[idx], idx) cate_pred_list.append(cate_pred) kernel_pred_list.append(kernel_pred) return cate_pred_list, kernel_pred_list def _get_output_single(self, input, idx): ins_kernel_feat = input # CoordConv x_range = paddle.linspace( -1, 1, ins_kernel_feat.shape[-1], dtype='float32') y_range = paddle.linspace( -1, 1, ins_kernel_feat.shape[-2], dtype='float32') y, x = paddle.meshgrid([y_range, x_range]) x = paddle.unsqueeze(x, [0, 1]) y = paddle.unsqueeze(y, [0, 1]) y = paddle.expand( y, shape=[ins_kernel_feat.shape[0], 1, -1, -1]) x = paddle.expand( x, shape=[ins_kernel_feat.shape[0], 1, -1, -1]) coord_feat = paddle.concat([x, y], axis=1) ins_kernel_feat = paddle.concat([ins_kernel_feat, coord_feat], axis=1) # kernel branch kernel_feat = ins_kernel_feat seg_num_grid = self.seg_num_grids[idx] kernel_feat = F.interpolate( kernel_feat, size=[seg_num_grid, seg_num_grid], mode='bilinear', align_corners=False, align_mode=0) cate_feat = kernel_feat[:, :-2, :, :] for kernel_layer in self.kernel_pred_convs: kernel_feat = F.relu(kernel_layer(kernel_feat)) if self.drop_block and self.training: kernel_feat = self.drop_block_fun(kernel_feat) kernel_pred = self.solo_kernel(kernel_feat) # cate branch for cate_layer in self.cate_pred_convs: cate_feat = F.relu(cate_layer(cate_feat)) if self.drop_block and self.training: cate_feat = self.drop_block_fun(cate_feat) cate_pred = self.solo_cate(cate_feat) if not self.training: cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2) cate_pred = paddle.transpose(cate_pred, [0, 2, 3, 1]) return cate_pred, kernel_pred def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels, cate_labels, grid_order_list, fg_num): """ Get loss of network of SOLOv2. Args: cate_preds (list): Tensor list of categroy branch output. 
kernel_preds (list): Tensor list of kernel branch output. ins_pred (list): Tensor list of instance branch output. ins_labels (list): List of instance labels per batch. cate_labels (list): List of category labels per batch. grid_order_list (list): List of indices per grid. fg_num (int): Number of positive samples in a mini-batch. Returns: loss_ins (Tensor): The instance loss Tensor of SOLOv2 network. loss_cate (Tensor): The category loss Tensor of SOLOv2 network. """ batch_size = grid_order_list[0].shape[0] ins_pred_list = [] for kernel_preds_level, grid_orders_level in zip(kernel_preds, grid_order_list): if grid_orders_level.shape[1] == 0: ins_pred_list.append(None) continue grid_orders_level = paddle.reshape(grid_orders_level, [-1]) reshape_pred = paddle.reshape( kernel_preds_level, shape=(kernel_preds_level.shape[0], kernel_preds_level.shape[1], -1)) reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1]) reshape_pred = paddle.reshape( reshape_pred, shape=(-1, reshape_pred.shape[2])) gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level) gathered_pred = paddle.reshape( gathered_pred, shape=[batch_size, -1, gathered_pred.shape[1]]) cur_ins_pred = ins_pred cur_ins_pred = paddle.reshape( cur_ins_pred, shape=(cur_ins_pred.shape[0], cur_ins_pred.shape[1], -1)) ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred) cur_ins_pred = paddle.reshape( ins_pred_conv, shape=(-1, ins_pred.shape[-2], ins_pred.shape[-1])) ins_pred_list.append(cur_ins_pred) num_ins = paddle.sum(fg_num) cate_preds = [ paddle.reshape( paddle.transpose(cate_pred, [0, 2, 3, 1]), shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds ] flatten_cate_preds = paddle.concat(cate_preds) new_cate_labels = [] for cate_label in cate_labels: new_cate_labels.append(paddle.reshape(cate_label, shape=[-1])) cate_labels = paddle.concat(new_cate_labels) loss_ins, loss_cate = self.solov2_loss( ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins) return {'loss_ins': loss_ins, 'loss_cate': loss_cate} def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape, scale_factor): """ Get prediction result of SOLOv2 network Args: cate_preds (list): List of Variables, output of category branch. kernel_preds (list): List of Variables, output of kernel branch. seg_pred (list): List of Variables, output of mask head stages. im_shape (Variables): [h, w] for input images. scale_factor (Variables): [scale, scale] for input images. Returns: seg_masks (Tensor): The prediction segmentation. cate_labels (Tensor): The prediction category label of each segmentation. cate_scores (Tensor): The prediction score of each segmentation. 
""" num_levels = len(cate_preds) featmap_size = seg_pred.shape[-2:] seg_masks_list = [] cate_labels_list = [] cate_scores_list = [] cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds] kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds] # Currently only supports batch size == 1 for idx in range(1): cate_pred_list = [ paddle.reshape( cate_preds[i][idx], shape=(-1, self.cate_out_channels)) for i in range(num_levels) ] seg_pred_list = seg_pred kernel_pred_list = [ paddle.reshape( paddle.transpose(kernel_preds[i][idx], [1, 2, 0]), shape=(-1, self.kernel_out_channels)) for i in range(num_levels) ] cate_pred_list = paddle.concat(cate_pred_list, axis=0) kernel_pred_list = paddle.concat(kernel_pred_list, axis=0) seg_masks, cate_labels, cate_scores = self.get_seg_single( cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size, im_shape[idx], scale_factor[idx][0]) bbox_num = cate_labels.shape[0:1] return seg_masks, cate_labels, cate_scores, bbox_num def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size, im_shape, scale_factor): """ The code of this function is based on: https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L385 """ h = paddle.cast(im_shape[0], 'int32') w = paddle.cast(im_shape[1], 'int32') upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4] y = paddle.zeros(shape=cate_preds.shape, dtype='float32') inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y) inds = paddle.nonzero(inds) cate_preds = paddle.reshape(cate_preds, shape=[-1]) # Prevent empty and increase fake data ind_a = paddle.cast(paddle.shape(kernel_preds)[0:1], 'int64') ind_b = paddle.zeros(shape=[1], dtype='int64') inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0) inds = paddle.concat([inds, inds_end]) kernel_preds_end = paddle.ones( shape=[1, self.kernel_out_channels], dtype='float32') kernel_preds = paddle.concat([kernel_preds, kernel_preds_end]) cate_preds = paddle.concat( [cate_preds, paddle.zeros( shape=[1], dtype='float32')]) # cate_labels & kernel_preds cate_labels = inds[:, 1] kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0]) cate_score_idx = paddle.add(inds[:, 0] * self.cate_out_channels, cate_labels) cate_scores = paddle.gather(cate_preds, index=cate_score_idx) size_trans = np.power(self.seg_num_grids, 2) strides = [] for _ind in range(len(self.segm_strides)): strides.append( paddle.full( shape=[int(size_trans[_ind])], fill_value=self.segm_strides[_ind], dtype="int32")) strides = paddle.concat(strides) strides = paddle.concat( [strides, paddle.zeros( shape=[1], dtype='int32')]) strides = paddle.gather(strides, index=inds[:, 0]) # mask encoding. 
kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3]) seg_preds = F.conv2d(seg_preds, kernel_preds) seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0])) seg_masks = seg_preds > self.mask_threshold seg_masks = paddle.cast(seg_masks, 'float32') sum_masks = paddle.sum(seg_masks, axis=[1, 2]) y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32') keep = paddle.where(sum_masks > strides.cast(sum_masks.dtype), sum_masks, y) keep = paddle.nonzero(keep) keep = paddle.squeeze(keep, axis=[1]) # Prevent empty and increase fake data keep_other = paddle.concat( [keep, paddle.cast(paddle.shape(sum_masks)[0:1] - 1, 'int64')]) keep_scores = paddle.concat( [keep, paddle.cast(paddle.shape(sum_masks)[0:1], 'int64')]) cate_scores_end = paddle.zeros(shape=[1], dtype='float32') cate_scores = paddle.concat([cate_scores, cate_scores_end]) seg_masks = paddle.gather(seg_masks, index=keep_other) seg_preds = paddle.gather(seg_preds, index=keep_other) sum_masks = paddle.gather(sum_masks, index=keep_other) cate_labels = paddle.gather(cate_labels, index=keep_other) cate_scores = paddle.gather(cate_scores, index=keep_scores) # mask scoring. seg_mul = paddle.cast(seg_preds * seg_masks, 'float32') seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks cate_scores *= seg_scores # Matrix NMS seg_preds, cate_scores, cate_labels = self.mask_nms( seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks) ori_shape = im_shape[:2] / scale_factor + 0.5 ori_shape = paddle.cast(ori_shape, 'int32') seg_preds = F.interpolate( paddle.unsqueeze(seg_preds, 0), size=upsampled_size_out, mode='bilinear', align_corners=False, align_mode=0) seg_preds = paddle.slice( seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w]) seg_masks = paddle.squeeze( F.interpolate( seg_preds, size=ori_shape[:2], mode='bilinear', align_corners=False, align_mode=0), axis=[0]) seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'uint8') return seg_masks, cate_labels, cate_scores ================================================ FILE: ppdet/modeling/heads/sparse_roi_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
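# Illustrative sketch (not part of the original source): the heads in
# this file build on "dynamic instance interaction" - each proposal
# embedding is mapped to the weights of two per-instance linear layers,
# which are applied to that instance's RoI features with batched matmul
# (see DynamicConv below). A minimal version of the parameter-generation
# step with the default sizes in=256, feat=64; the `_demo_*` helper is
# hypothetical:
def _demo_dynamic_params(num_inst=100, in_ch=256, feat_ch=64):
    import paddle
    proposal_feats = paddle.rand([num_inst, in_ch])
    gen = paddle.nn.Linear(in_ch, in_ch * feat_ch + feat_ch * in_ch)
    params = gen(proposal_feats)
    param_in = params[:, :in_ch * feat_ch].reshape([-1, in_ch, feat_ch])
    param_out = params[:, in_ch * feat_ch:].reshape([-1, feat_ch, in_ch])
    roi_feats = paddle.rand([num_inst, 49, in_ch])  # 7x7 RoI, channels last
    x = paddle.bmm(roi_feats, param_in)             # -> [num_inst, 49, 64]
    return paddle.bmm(x, param_out)                 # -> [num_inst, 49, 256]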
# This code is referenced from: https://github.com/open-mmlab/mmdetection from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import paddle from paddle import nn from ppdet.core.workspace import register from ppdet.modeling import initializer as init from .roi_extractor import RoIAlign from ..bbox_utils import delta2bbox_v2 from ..cls_utils import _get_class_default_kwargs from ..layers import MultiHeadAttention __all__ = ['SparseRoIHead', 'DIIHead', 'DynamicMaskHead'] class DynamicConv(nn.Layer): def __init__(self, in_channels=256, feature_channels=64, out_channels=None, roi_resolution=7, with_proj=True): super(DynamicConv, self).__init__() self.in_channels = in_channels self.feature_channels = feature_channels self.out_channels = out_channels if out_channels else in_channels self.num_params_in = self.in_channels * self.feature_channels self.num_params_out = self.out_channels * self.feature_channels self.dynamic_layer = nn.Linear(self.in_channels, self.num_params_in + self.num_params_out) self.norm_in = nn.LayerNorm(self.feature_channels) self.norm_out = nn.LayerNorm(self.out_channels) self.activation = nn.ReLU() self.with_proj = with_proj if self.with_proj: num_output = self.out_channels * roi_resolution**2 self.fc_layer = nn.Linear(num_output, self.out_channels) self.fc_norm = nn.LayerNorm(self.out_channels) def forward(self, param_feature, input_feature): input_feature = input_feature.flatten(2).transpose([2, 0, 1]) input_feature = input_feature.transpose([1, 0, 2]) parameters = self.dynamic_layer(param_feature) param_in = parameters[:, :self.num_params_in].reshape( [-1, self.in_channels, self.feature_channels]) param_out = parameters[:, -self.num_params_out:].reshape( [-1, self.feature_channels, self.out_channels]) features = paddle.bmm(input_feature, param_in) features = self.norm_in(features) features = self.activation(features) features = paddle.bmm(features, param_out) features = self.norm_out(features) features = self.activation(features) if self.with_proj: features = features.flatten(1) features = self.fc_layer(features) features = self.fc_norm(features) features = self.activation(features) return features class FFN(nn.Layer): def __init__(self, embed_dims=256, feedforward_channels=2048, num_fcs=2, ffn_drop=0.0, add_identity=True): super(FFN, self).__init__() layers = [] in_channels = embed_dims for _ in range(num_fcs - 1): layers.append( nn.Sequential( nn.Linear(in_channels, feedforward_channels), nn.ReLU(), nn.Dropout(ffn_drop))) in_channels = feedforward_channels layers.append(nn.Linear(feedforward_channels, embed_dims)) layers.append(nn.Dropout(ffn_drop)) self.layers = nn.Sequential(*layers) self.add_identity = add_identity def forward(self, x): identity = x out = self.layers(x) if not self.add_identity: return out else: return out + identity @register class DynamicMaskHead(nn.Layer): __shared__ = ['num_classes', 'proposal_embedding_dim', 'norm_type'] def __init__(self, num_classes=80, proposal_embedding_dim=256, dynamic_feature_channels=64, roi_resolution=14, num_convs=4, conv_kernel_size=3, conv_channels=256, upsample_method='deconv', upsample_scale_factor=2, norm_type='bn'): super(DynamicMaskHead, self).__init__() self.d_model = proposal_embedding_dim self.instance_interactive_conv = DynamicConv( self.d_model, dynamic_feature_channels, roi_resolution=roi_resolution, with_proj=False) self.convs = nn.LayerList() for i in range(num_convs): self.convs.append( nn.Sequential( nn.Conv2D( self.d_model if i == 0 
else conv_channels, conv_channels, conv_kernel_size, padding='same', bias_attr=False), nn.BatchNorm2D(conv_channels), nn.ReLU())) if norm_type == 'sync_bn': self.convs = nn.SyncBatchNorm.convert_sync_batchnorm(self.convs) self.upsample_method = upsample_method if upsample_method is None: self.upsample = None elif upsample_method == 'deconv': self.upsample = nn.Conv2DTranspose( conv_channels if num_convs > 0 else self.d_model, conv_channels, upsample_scale_factor, stride=upsample_scale_factor) self.relu = nn.ReLU() else: self.upsample = nn.Upsample(None, upsample_scale_factor) cls_in_channels = conv_channels if num_convs > 0 else self.d_model cls_in_channels = conv_channels if upsample_method == 'deconv' else cls_in_channels self.conv_cls = nn.Conv2D(cls_in_channels, num_classes, 1) self._init_weights() def _init_weights(self): for p in self.parameters(): if p.dim() > 1: init.xavier_uniform_(p) init.constant_(self.conv_cls.bias, 0.) def forward(self, roi_features, attn_features): attn_features = attn_features.reshape([-1, self.d_model]) attn_features_iic = self.instance_interactive_conv(attn_features, roi_features) x = attn_features_iic.transpose([0, 2, 1]).reshape(roi_features.shape) for conv in self.convs: x = conv(x) if self.upsample is not None: x = self.upsample(x) if self.upsample_method == 'deconv': x = self.relu(x) mask_pred = self.conv_cls(x) return mask_pred @register class DIIHead(nn.Layer): __shared__ = ['num_classes', 'proposal_embedding_dim'] def __init__(self, num_classes=80, proposal_embedding_dim=256, feedforward_channels=2048, dynamic_feature_channels=64, roi_resolution=7, num_attn_heads=8, dropout=0.0, num_ffn_fcs=2, num_cls_fcs=1, num_reg_fcs=3): super(DIIHead, self).__init__() self.num_classes = num_classes self.d_model = proposal_embedding_dim self.attention = MultiHeadAttention(self.d_model, num_attn_heads, dropout) self.attention_norm = nn.LayerNorm(self.d_model) self.instance_interactive_conv = DynamicConv( self.d_model, dynamic_feature_channels, roi_resolution=roi_resolution, with_proj=True) self.instance_interactive_conv_dropout = nn.Dropout(dropout) self.instance_interactive_conv_norm = nn.LayerNorm(self.d_model) self.ffn = FFN(self.d_model, feedforward_channels, num_ffn_fcs, dropout) self.ffn_norm = nn.LayerNorm(self.d_model) self.cls_fcs = nn.LayerList() for _ in range(num_cls_fcs): self.cls_fcs.append( nn.Linear( self.d_model, self.d_model, bias_attr=False)) self.cls_fcs.append(nn.LayerNorm(self.d_model)) self.cls_fcs.append(nn.ReLU()) self.fc_cls = nn.Linear(self.d_model, self.num_classes) self.reg_fcs = nn.LayerList() for _ in range(num_reg_fcs): self.reg_fcs.append( nn.Linear( self.d_model, self.d_model, bias_attr=False)) self.reg_fcs.append(nn.LayerNorm(self.d_model)) self.reg_fcs.append(nn.ReLU()) self.fc_reg = nn.Linear(self.d_model, 4) self._init_weights() def _init_weights(self): for p in self.parameters(): if p.dim() > 1: init.xavier_uniform_(p) bias_init = init.bias_init_with_prob(0.01) init.constant_(self.fc_cls.bias, bias_init) def forward(self, roi_features, proposal_features): N, num_proposals = proposal_features.shape[:2] proposal_features = proposal_features + self.attention( proposal_features) attn_features = self.attention_norm(proposal_features) proposal_features = attn_features.reshape([-1, self.d_model]) proposal_features_iic = self.instance_interactive_conv( proposal_features, roi_features) proposal_features = proposal_features + self.instance_interactive_conv_dropout( proposal_features_iic) obj_features = 
self.instance_interactive_conv_norm(proposal_features) obj_features = self.ffn(obj_features) obj_features = self.ffn_norm(obj_features) cls_feature = obj_features.clone() reg_feature = obj_features.clone() for cls_layer in self.cls_fcs: cls_feature = cls_layer(cls_feature) class_logits = self.fc_cls(cls_feature) for reg_layer in self.reg_fcs: reg_feature = reg_layer(reg_feature) bbox_deltas = self.fc_reg(reg_feature) class_logits = class_logits.reshape( [N, num_proposals, self.num_classes]) bbox_deltas = bbox_deltas.reshape([N, num_proposals, 4]) obj_features = obj_features.reshape([N, num_proposals, self.d_model]) return class_logits, bbox_deltas, obj_features, attn_features @staticmethod def refine_bboxes(proposal_bboxes, bbox_deltas): pred_bboxes = delta2bbox_v2( bbox_deltas.reshape([-1, 4]), proposal_bboxes.reshape([-1, 4]), delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[0.5, 0.5, 1.0, 1.0], ctr_clip=None) return pred_bboxes.reshape(proposal_bboxes.shape) @register class SparseRoIHead(nn.Layer): __inject__ = ['bbox_head', 'mask_head', 'loss_func'] def __init__(self, num_stages=6, bbox_roi_extractor=_get_class_default_kwargs(RoIAlign), mask_roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_head='DIIHead', mask_head='DynamicMaskHead', loss_func='QueryInstLoss'): super(SparseRoIHead, self).__init__() self.num_stages = num_stages self.bbox_roi_extractor = bbox_roi_extractor self.mask_roi_extractor = mask_roi_extractor if isinstance(bbox_roi_extractor, dict): self.bbox_roi_extractor = RoIAlign(**bbox_roi_extractor) if isinstance(mask_roi_extractor, dict): self.mask_roi_extractor = RoIAlign(**mask_roi_extractor) self.bbox_heads = nn.LayerList( [copy.deepcopy(bbox_head) for _ in range(num_stages)]) self.mask_heads = nn.LayerList( [copy.deepcopy(mask_head) for _ in range(num_stages)]) self.loss_helper = loss_func @classmethod def from_config(cls, cfg, input_shape): bbox_roi_extractor = cfg['bbox_roi_extractor'] mask_roi_extractor = cfg['mask_roi_extractor'] assert isinstance(bbox_roi_extractor, dict) assert isinstance(mask_roi_extractor, dict) kwargs = RoIAlign.from_config(cfg, input_shape) bbox_roi_extractor.update(kwargs) mask_roi_extractor.update(kwargs) return { 'bbox_roi_extractor': bbox_roi_extractor, 'mask_roi_extractor': mask_roi_extractor } @staticmethod def get_roi_features(features, bboxes, roi_extractor): rois_list = [ bboxes[i] for i in range(len(bboxes)) if len(bboxes[i]) > 0 ] rois_num = paddle.to_tensor( [len(bboxes[i]) for i in range(len(bboxes))], dtype='int32') pos_ids = paddle.cast(rois_num, dtype='bool') if pos_ids.sum() != len(rois_num): rois_num = rois_num[pos_ids] features = [features[i][pos_ids] for i in range(len(features))] return roi_extractor(features, rois_list, rois_num) def _forward_train(self, body_feats, pro_bboxes, pro_feats, targets): all_stage_losses = {} for stage in range(self.num_stages): bbox_head = self.bbox_heads[stage] mask_head = self.mask_heads[stage] roi_feats = self.get_roi_features(body_feats, pro_bboxes, self.bbox_roi_extractor) class_logits, bbox_deltas, pro_feats, attn_feats = bbox_head( roi_feats, pro_feats) bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, bbox_deltas) indices = self.loss_helper.matcher({ 'pred_logits': class_logits.detach(), 'pred_boxes': bbox_pred.detach() }, targets) avg_factor = paddle.to_tensor( [sum(len(tgt['labels']) for tgt in targets)], dtype='float32') if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(avg_factor) avg_factor /= paddle.distributed.get_world_size() avg_factor = 
paddle.clip(avg_factor, min=1.) loss_classes = self.loss_helper.loss_classes(class_logits, targets, indices, avg_factor) if sum(len(v['labels']) for v in targets) == 0: loss_bboxes = { 'loss_bbox': paddle.to_tensor([0.]), 'loss_giou': paddle.to_tensor([0.]) } loss_masks = {'loss_mask': paddle.to_tensor([0.])} else: loss_bboxes = self.loss_helper.loss_bboxes(bbox_pred, targets, indices, avg_factor) pos_attn_feats = paddle.concat([ paddle.gather( src, src_idx, axis=0) for src, (src_idx, _) in zip(attn_feats, indices) ]) pos_bbox_pred = [ paddle.gather( src, src_idx, axis=0) for src, (src_idx, _) in zip(bbox_pred.detach(), indices) ] pos_roi_feats = self.get_roi_features(body_feats, pos_bbox_pred, self.mask_roi_extractor) mask_logits = mask_head(pos_roi_feats, pos_attn_feats) loss_masks = self.loss_helper.loss_masks( pos_bbox_pred, mask_logits, targets, indices, avg_factor) for loss in [loss_classes, loss_bboxes, loss_masks]: for key in loss.keys(): all_stage_losses[f'stage{stage}_{key}'] = loss[key] pro_bboxes = bbox_pred.detach() return all_stage_losses def _forward_test(self, body_feats, pro_bboxes, pro_feats): for stage in range(self.num_stages): roi_feats = self.get_roi_features(body_feats, pro_bboxes, self.bbox_roi_extractor) class_logits, bbox_deltas, pro_feats, attn_feats = self.bbox_heads[ stage](roi_feats, pro_feats) bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, bbox_deltas) pro_bboxes = bbox_pred.detach() roi_feats = self.get_roi_features(body_feats, bbox_pred, self.mask_roi_extractor) mask_logits = self.mask_heads[stage](roi_feats, attn_feats) return { 'class_logits': class_logits, 'bbox_pred': bbox_pred, 'mask_logits': mask_logits } def forward(self, body_features, proposal_bboxes, proposal_features, targets=None): if self.training: return self._forward_train(body_features, proposal_bboxes, proposal_features, targets) else: return self._forward_test(body_features, proposal_bboxes, proposal_features) ================================================ FILE: ppdet/modeling/heads/sparsercnn_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py Ths copyright of PeizeSun/SparseR-CNN is as follows: MIT License [see LICENSE for details] """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import copy import paddle import paddle.nn as nn from ppdet.core.workspace import register from ppdet.modeling.heads.roi_extractor import RoIAlign from ppdet.modeling.bbox_utils import delta2bbox from .. import initializer as init _DEFAULT_SCALE_CLAMP = math.log(100000. 
/ 16) class DynamicConv(nn.Layer): def __init__( self, head_hidden_dim, head_dim_dynamic, head_num_dynamic, ): super().__init__() self.hidden_dim = head_hidden_dim self.dim_dynamic = head_dim_dynamic self.num_dynamic = head_num_dynamic self.num_params = self.hidden_dim * self.dim_dynamic self.dynamic_layer = nn.Linear(self.hidden_dim, self.num_dynamic * self.num_params) self.norm1 = nn.LayerNorm(self.dim_dynamic) self.norm2 = nn.LayerNorm(self.hidden_dim) self.activation = nn.ReLU() pooler_resolution = 7 num_output = self.hidden_dim * pooler_resolution**2 self.out_layer = nn.Linear(num_output, self.hidden_dim) self.norm3 = nn.LayerNorm(self.hidden_dim) def forward(self, pro_features, roi_features): ''' pro_features: (1, N * nr_boxes, self.d_model) roi_features: (49, N * nr_boxes, self.d_model) ''' features = roi_features.transpose(perm=[1, 0, 2]) parameters = self.dynamic_layer(pro_features).transpose(perm=[1, 0, 2]) param1 = parameters[:, :, :self.num_params].reshape( [-1, self.hidden_dim, self.dim_dynamic]) param2 = parameters[:, :, self.num_params:].reshape( [-1, self.dim_dynamic, self.hidden_dim]) features = paddle.bmm(features, param1) features = self.norm1(features) features = self.activation(features) features = paddle.bmm(features, param2) features = self.norm2(features) features = self.activation(features) features = features.flatten(1) features = self.out_layer(features) features = self.norm3(features) features = self.activation(features) return features class RCNNHead(nn.Layer): def __init__( self, d_model, num_classes, dim_feedforward, nhead, dropout, head_cls, head_reg, head_dim_dynamic, head_num_dynamic, scale_clamp: float=_DEFAULT_SCALE_CLAMP, bbox_weights=(2.0, 2.0, 1.0, 1.0), ): super().__init__() self.d_model = d_model # dynamic. self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout) self.inst_interact = DynamicConv(d_model, head_dim_dynamic, head_num_dynamic) self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = nn.ReLU() # cls. num_cls = head_cls cls_module = list() for _ in range(num_cls): cls_module.append(nn.Linear(d_model, d_model, bias_attr=False)) cls_module.append(nn.LayerNorm(d_model)) cls_module.append(nn.ReLU()) self.cls_module = nn.LayerList(cls_module) # reg. num_reg = head_reg reg_module = list() for _ in range(num_reg): reg_module.append(nn.Linear(d_model, d_model, bias_attr=False)) reg_module.append(nn.LayerNorm(d_model)) reg_module.append(nn.ReLU()) self.reg_module = nn.LayerList(reg_module) # pred. self.class_logits = nn.Linear(d_model, num_classes) self.bboxes_delta = nn.Linear(d_model, 4) self.scale_clamp = scale_clamp self.bbox_weights = bbox_weights def forward(self, features, bboxes, pro_features, pooler): """ :param bboxes: (N, nr_boxes, 4) :param pro_features: (N, nr_boxes, d_model) """ N, nr_boxes = bboxes.shape[:2] proposal_boxes = list() for b in range(N): proposal_boxes.append(bboxes[b]) roi_num = paddle.full([N], nr_boxes).astype("int32") roi_features = pooler(features, proposal_boxes, roi_num) roi_features = roi_features.reshape( [N * nr_boxes, self.d_model, -1]).transpose(perm=[2, 0, 1]) # self_att. 
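# Illustrative sketch (not part of the original source): the "# self_att."
# step below lets the nr_boxes proposal embeddings of one image attend to
# each other before they interact with their RoI features; it is plain
# multi-head self-attention over a [N, nr_boxes, d_model] tensor, followed
# in the original by dropout, a residual add, and LayerNorm. The `_demo_*`
# helper is hypothetical:
def _demo_proposal_self_attention(N=2, nr_boxes=100, d_model=256):
    import paddle
    import paddle.nn as nn
    pro_features = paddle.rand([N, nr_boxes, d_model])
    self_attn = nn.MultiHeadAttention(d_model, num_heads=8, dropout=0.0)
    attended = self_attn(pro_features, pro_features, pro_features)
    return pro_features + attended  # residual connection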
pro_features = pro_features.reshape([N, nr_boxes, self.d_model]) pro_features2 = self.self_attn( pro_features, pro_features, value=pro_features) pro_features = pro_features.transpose(perm=[1, 0, 2]) + self.dropout1( pro_features2.transpose(perm=[1, 0, 2])) pro_features = self.norm1(pro_features) # inst_interact. pro_features = pro_features.reshape( [nr_boxes, N, self.d_model]).transpose(perm=[1, 0, 2]).reshape( [1, N * nr_boxes, self.d_model]) pro_features2 = self.inst_interact(pro_features, roi_features) pro_features = pro_features + self.dropout2(pro_features2) obj_features = self.norm2(pro_features) # obj_feature. obj_features2 = self.linear2( self.dropout(self.activation(self.linear1(obj_features)))) obj_features = obj_features + self.dropout3(obj_features2) obj_features = self.norm3(obj_features) fc_feature = obj_features.transpose(perm=[1, 0, 2]).reshape( [N * nr_boxes, -1]) cls_feature = fc_feature.clone() reg_feature = fc_feature.clone() for cls_layer in self.cls_module: cls_feature = cls_layer(cls_feature) for reg_layer in self.reg_module: reg_feature = reg_layer(reg_feature) class_logits = self.class_logits(cls_feature) bboxes_deltas = self.bboxes_delta(reg_feature) pred_bboxes = delta2bbox(bboxes_deltas, bboxes.reshape([-1, 4]), self.bbox_weights) return class_logits.reshape([N, nr_boxes, -1]), pred_bboxes.reshape( [N, nr_boxes, -1]), obj_features @register class SparseRCNNHead(nn.Layer): ''' SparseRCNNHead Args: roi_input_shape (list[ShapeSpec]): The output shape of fpn num_classes (int): Number of classes, head_hidden_dim (int): The embedding dim of MultiHeadAttention, head_dim_feedforward (int): The feedforward dim of each RCNNHead, nhead (int): The number of heads in MultiHeadAttention, head_dropout (float): The dropout probability, head_cls (int): The number of layers in the classification branch, head_reg (int): The number of layers in the regression branch, head_dim_dynamic (int): The hidden dim of DynamicConv, head_num_dynamic (int): The number of dynamic parameter groups in DynamicConv, head_num_heads (int): The number of stacked RCNNHead stages, deep_supervision (bool): whether to supervise the intermediate results, num_proposals (int): the number of proposal boxes and features ''' __inject__ = ['loss_func'] __shared__ = ['num_classes'] def __init__( self, head_hidden_dim, head_dim_feedforward, nhead, head_dropout, head_cls, head_reg, head_dim_dynamic, head_num_dynamic, head_num_heads, deep_supervision, num_proposals, num_classes=80, loss_func="SparseRCNNLoss", roi_input_shape=None, ): super().__init__() assert head_num_heads > 0, \ f'At least one RoI head is required, but got {head_num_heads}.' # Build RoI. box_pooler = self._init_box_pooler(roi_input_shape) self.box_pooler = box_pooler # Build heads. rcnn_head = RCNNHead( head_hidden_dim, num_classes, head_dim_feedforward, nhead, head_dropout, head_cls, head_reg, head_dim_dynamic, head_num_dynamic, ) self.head_series = nn.LayerList( [copy.deepcopy(rcnn_head) for i in range(head_num_heads)]) self.return_intermediate = deep_supervision self.num_classes = num_classes # build init proposal self.init_proposal_features = nn.Embedding(num_proposals, head_hidden_dim) self.init_proposal_boxes = nn.Embedding(num_proposals, 4) self.lossfunc = loss_func # Init parameters. init.reset_initialized_parameter(self) self._reset_parameters() def _reset_parameters(self): # init all parameters.
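# Classification biases are set to -log((1 - prior_prob) / prior_prob)
# below, so each class logit starts with sigmoid(bias) = prior_prob
# (0.01): the focal-loss-style initialization that keeps early training
# stable by biasing predictions toward background.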
prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) for m in self.sublayers(): if isinstance(m, nn.Linear): init.xavier_normal_(m.weight, reverse=True) elif not isinstance(m, nn.Embedding) and hasattr( m, "weight") and m.weight.dim() > 1: init.xavier_normal_(m.weight, reverse=False) if hasattr(m, "bias") and m.bias is not None and m.bias.shape[ -1] == self.num_classes: init.constant_(m.bias, bias_value) init_bboxes = paddle.empty_like(self.init_proposal_boxes.weight) init_bboxes[:, :2] = 0.5 init_bboxes[:, 2:] = 1.0 self.init_proposal_boxes.weight.set_value(init_bboxes) @staticmethod def _init_box_pooler(input_shape): pooler_resolution = 7 sampling_ratio = 2 if input_shape is not None: pooler_scales = tuple(1.0 / input_shape[k].stride for k in range(len(input_shape))) in_channels = [ input_shape[f].channels for f in range(len(input_shape)) ] end_level = len(input_shape) - 1 # Check all channel counts are equal assert len(set(in_channels)) == 1, in_channels else: pooler_scales = [1.0 / 4.0, 1.0 / 8.0, 1.0 / 16.0, 1.0 / 32.0] end_level = 3 aligned = True if paddle.device.is_compiled_with_custom_device('npu'): aligned = False box_pooler = RoIAlign( resolution=pooler_resolution, spatial_scale=pooler_scales, sampling_ratio=sampling_ratio, end_level=end_level, aligned=aligned) return box_pooler def forward(self, features, input_whwh): bs = len(features[0]) bboxes = box_cxcywh_to_xyxy(self.init_proposal_boxes.weight.clone( )).unsqueeze(0) bboxes = bboxes * input_whwh.unsqueeze(-2) init_features = self.init_proposal_features.weight.unsqueeze(0).tile( [1, bs, 1]) proposal_features = init_features.clone() inter_class_logits = [] inter_pred_bboxes = [] for stage, rcnn_head in enumerate(self.head_series): class_logits, pred_bboxes, proposal_features = rcnn_head( features, bboxes, proposal_features, self.box_pooler) if self.return_intermediate or stage == len(self.head_series) - 1: inter_class_logits.append(class_logits) inter_pred_bboxes.append(pred_bboxes) bboxes = pred_bboxes.detach() output = { 'pred_logits': inter_class_logits[-1], 'pred_boxes': inter_pred_bboxes[-1] } if self.return_intermediate: output['aux_outputs'] = [{ 'pred_logits': a, 'pred_boxes': b } for a, b in zip(inter_class_logits[:-1], inter_pred_bboxes[:-1])] return output def get_loss(self, outputs, targets): losses = self.lossfunc(outputs, targets) weight_dict = self.lossfunc.weight_dict for k in losses.keys(): if k in weight_dict: losses[k] *= weight_dict[k] return losses def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return paddle.stack(b, axis=-1) ================================================ FILE: ppdet/modeling/heads/ssd_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
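# SepConvLayer below factors a kxk convolution into a depthwise conv
# (groups=C_in) followed by a 1x1 pointwise conv, cutting parameters from
# about C_in*C_out*k^2 to C_in*k^2 + C_in*C_out; SSDHead uses it for its
# box/score prediction branches when use_sepconv=True.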
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from paddle.regularizer import L2Decay from paddle import ParamAttr from ..layers import AnchorGeneratorSSD from ..cls_utils import _get_class_default_kwargs class SepConvLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, conv_decay=0.): super(SepConvLayer, self).__init__() self.dw_conv = nn.Conv2D( in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, stride=1, padding=padding, groups=in_channels, weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)), bias_attr=False) self.bn = nn.BatchNorm2D( in_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.)), bias_attr=ParamAttr(regularizer=L2Decay(0.))) self.pw_conv = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)), bias_attr=False) def forward(self, x): x = self.dw_conv(x) x = F.relu6(self.bn(x)) x = self.pw_conv(x) return x class SSDExtraHead(nn.Layer): def __init__(self, in_channels=256, out_channels=([256, 512], [256, 512], [128, 256], [128, 256], [128, 256]), strides=(2, 2, 2, 1, 1), paddings=(1, 1, 1, 0, 0)): super(SSDExtraHead, self).__init__() self.convs = nn.LayerList() for out_channel, stride, padding in zip(out_channels, strides, paddings): self.convs.append( self._make_layers(in_channels, out_channel[0], out_channel[1], stride, padding)) in_channels = out_channel[-1] def _make_layers(self, c_in, c_hidden, c_out, stride_3x3, padding_3x3): return nn.Sequential( nn.Conv2D(c_in, c_hidden, 1), nn.ReLU(), nn.Conv2D(c_hidden, c_out, 3, stride_3x3, padding_3x3), nn.ReLU()) def forward(self, x): out = [x] for conv_layer in self.convs: out.append(conv_layer(out[-1])) return out @register class SSDHead(nn.Layer): """ SSDHead Args: num_classes (int): Number of classes in_channels (list): Number of channels per input feature anchor_generator (dict): Configuration of 'AnchorGeneratorSSD' instance kernel_size (int): Conv kernel size padding (int): Conv padding use_sepconv (bool): Use SepConvLayer if true conv_decay (float): Conv regularization coeff loss (object): 'SSDLoss' instance use_extra_head (bool): If ResNet34 is used as the backbone, `use_extra_head` should be set to True """ __shared__ = ['num_classes'] __inject__ = ['anchor_generator', 'loss'] def __init__(self, num_classes=80, in_channels=(512, 1024, 512, 256, 256, 256), anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD), kernel_size=3, padding=1, use_sepconv=False, conv_decay=0., loss='SSDLoss', use_extra_head=False): super(SSDHead, self).__init__() # add background class self.num_classes = num_classes + 1 self.in_channels = in_channels self.anchor_generator = anchor_generator self.loss = loss self.use_extra_head = use_extra_head if self.use_extra_head: self.ssd_extra_head = SSDExtraHead() self.in_channels = [256, 512, 512, 256, 256, 256] if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) self.num_priors = self.anchor_generator.num_priors self.box_convs = [] self.score_convs = [] for i, num_prior in enumerate(self.num_priors): box_conv_name = "boxes{}".format(i) if not use_sepconv: box_conv = self.add_sublayer( box_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * 4, kernel_size=kernel_size, padding=padding)) else: box_conv = self.add_sublayer( box_conv_name, SepConvLayer( in_channels=self.in_channels[i], out_channels=num_prior *
4, kernel_size=kernel_size, padding=padding, conv_decay=conv_decay)) self.box_convs.append(box_conv) score_conv_name = "scores{}".format(i) if not use_sepconv: score_conv = self.add_sublayer( score_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * self.num_classes, kernel_size=kernel_size, padding=padding)) else: score_conv = self.add_sublayer( score_conv_name, SepConvLayer( in_channels=self.in_channels[i], out_channels=num_prior * self.num_classes, kernel_size=kernel_size, padding=padding, conv_decay=conv_decay)) self.score_convs.append(score_conv) @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def forward(self, feats, image, gt_bbox=None, gt_class=None): if self.use_extra_head: assert len(feats) == 1, \ ("If you set use_extra_head=True, backbone feature " "list length should be 1.") feats = self.ssd_extra_head(feats[0]) box_preds = [] cls_scores = [] for feat, box_conv, score_conv in zip(feats, self.box_convs, self.score_convs): box_pred = box_conv(feat) box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) box_pred = paddle.reshape(box_pred, [0, -1, 4]) box_preds.append(box_pred) cls_score = score_conv(feat) cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) cls_scores.append(cls_score) prior_boxes = self.anchor_generator(feats, image) if self.training: return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, prior_boxes) else: return (box_preds, cls_scores), prior_boxes def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) ================================================ FILE: ppdet/modeling/heads/tood_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant from ppdet.core.workspace import register from ..initializer import normal_, constant_, bias_init_with_prob from ppdet.modeling.bbox_utils import bbox_center, batch_distance2bbox from ..losses import GIoULoss from ppdet.modeling.layers import ConvNormLayer from ppdet.modeling.ops import get_static_shape from ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. 
""" def __init__(self, init_scale=1.): super(ScaleReg, self).__init__() self.scale_reg = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=init_scale)), dtype="float32") def forward(self, inputs): out = inputs * self.scale_reg return out class TaskDecomposition(nn.Layer): """This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py """ def __init__( self, feat_channels, stacked_convs, la_down_rate=8, norm_type='gn', norm_groups=32, ): super(TaskDecomposition, self).__init__() self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.norm_type = norm_type self.norm_groups = norm_groups self.in_channels = self.feat_channels * self.stacked_convs self.la_conv1 = nn.Conv2D(self.in_channels, self.in_channels // la_down_rate, 1) self.la_conv2 = nn.Conv2D(self.in_channels // la_down_rate, self.stacked_convs, 1) self.reduction_conv = ConvNormLayer( self.in_channels, self.feat_channels, filter_size=1, stride=1, norm_type=self.norm_type, norm_groups=self.norm_groups) self._init_weights() def _init_weights(self): normal_(self.la_conv1.weight, std=0.001) normal_(self.la_conv2.weight, std=0.001) def forward(self, feat, avg_feat): feat_shape = get_static_shape(feat) b = feat_shape[0:1] h = feat_shape[2:3] w = feat_shape[3:4] weight = F.relu(self.la_conv1(avg_feat)) weight = F.sigmoid(self.la_conv2(weight)).unsqueeze(-1) feat = paddle.reshape( feat, [b, self.stacked_convs, self.feat_channels, h, w]) * weight feat = self.reduction_conv(feat.flatten(1, 2)) feat = F.relu(feat) return feat @register class TOODHead(nn.Layer): """This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py """ __inject__ = ['nms', 'static_assigner', 'assigner'] __shared__ = ['num_classes'] def __init__(self, num_classes=80, feat_channels=256, stacked_convs=6, fpn_strides=(8, 16, 32, 64, 128), grid_cell_scale=8, grid_cell_offset=0.5, norm_type='gn', norm_groups=32, static_assigner_epoch=4, use_align_head=True, loss_weight={ 'class': 1.0, 'bbox': 1.0, 'iou': 2.0, }, nms='MultiClassNMS', static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner'): super(TOODHead, self).__init__() self.num_classes = num_classes self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.fpn_strides = fpn_strides self.grid_cell_scale = grid_cell_scale self.grid_cell_offset = grid_cell_offset self.static_assigner_epoch = static_assigner_epoch self.use_align_head = use_align_head self.nms = nms self.static_assigner = static_assigner self.assigner = assigner self.loss_weight = loss_weight self.giou_loss = GIoULoss() self.inter_convs = nn.LayerList() for i in range(self.stacked_convs): self.inter_convs.append( ConvNormLayer( self.feat_channels, self.feat_channels, filter_size=3, stride=1, norm_type=norm_type, norm_groups=norm_groups)) self.cls_decomp = TaskDecomposition( self.feat_channels, self.stacked_convs, self.stacked_convs * 8, norm_type=norm_type, norm_groups=norm_groups) self.reg_decomp = TaskDecomposition( self.feat_channels, self.stacked_convs, self.stacked_convs * 8, norm_type=norm_type, norm_groups=norm_groups) self.tood_cls = nn.Conv2D( self.feat_channels, self.num_classes, 3, padding=1) self.tood_reg = nn.Conv2D(self.feat_channels, 4, 3, padding=1) if self.use_align_head: self.cls_prob_conv1 = nn.Conv2D(self.feat_channels * self.stacked_convs, self.feat_channels // 4, 1) self.cls_prob_conv2 = nn.Conv2D( self.feat_channels // 4, 1, 3, padding=1) self.reg_offset_conv1 = 
nn.Conv2D(self.feat_channels * self.stacked_convs, self.feat_channels // 4, 1) self.reg_offset_conv2 = nn.Conv2D( self.feat_channels // 4, 4 * 2, 3, padding=1) self.scales_regs = nn.LayerList([ScaleReg() for _ in self.fpn_strides]) self._init_weights() @classmethod def from_config(cls, cfg, input_shape): return { 'feat_channels': input_shape[0].channels, 'fpn_strides': [i.stride for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) normal_(self.tood_cls.weight, std=0.01) constant_(self.tood_cls.bias, bias_cls) normal_(self.tood_reg.weight, std=0.01) if self.use_align_head: normal_(self.cls_prob_conv1.weight, std=0.01) normal_(self.cls_prob_conv2.weight, std=0.01) constant_(self.cls_prob_conv2.bias, bias_cls) normal_(self.reg_offset_conv1.weight, std=0.001) constant_(self.reg_offset_conv2.weight) constant_(self.reg_offset_conv2.bias) def _reg_grid_sample(self, feat, offset, anchor_points): feat_shape = get_static_shape(feat) b = feat_shape[0:1] h = feat_shape[2:3] w = feat_shape[3:4] feat = paddle.reshape(feat, [-1, 1, h, w]) offset = paddle.reshape(offset, [-1, 2, h, w]).transpose([0, 2, 3, 1]) grid_shape = paddle.concat([w, h]).astype('float32') grid = (offset + anchor_points) / grid_shape grid = 2 * grid.clip(0., 1.) - 1 feat = F.grid_sample(feat, grid) feat = paddle.reshape(feat, [b, -1, h, w]) return feat def forward(self, feats): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" anchors, anchor_points, num_anchors_list, stride_tensor =\ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) anchor_centers_split = paddle.split(anchor_points / stride_tensor, num_anchors_list) cls_score_list, bbox_pred_list = [], [] for feat, scale_reg, anchor_centers, stride in zip( feats, self.scales_regs, anchor_centers_split, self.fpn_strides): b, _, h, w = get_static_shape(feat) inter_feats = [] for inter_conv in self.inter_convs: feat = F.relu(inter_conv(feat)) inter_feats.append(feat) feat = paddle.concat(inter_feats, axis=1) # task decomposition avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_feat = self.cls_decomp(feat, avg_feat) reg_feat = self.reg_decomp(feat, avg_feat) # cls prediction and alignment cls_logits = self.tood_cls(cls_feat) if self.use_align_head: cls_prob = F.relu(self.cls_prob_conv1(feat)) cls_prob = F.sigmoid(self.cls_prob_conv2(cls_prob)) cls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt() else: cls_score = F.sigmoid(cls_logits) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) # reg prediction and alignment reg_dist = scale_reg(self.tood_reg(reg_feat).exp()) reg_dist = reg_dist.flatten(2).transpose([0, 2, 1]) reg_bbox = batch_distance2bbox( anchor_centers.unsqueeze(0), reg_dist) if self.use_align_head: reg_offset = F.relu(self.reg_offset_conv1(feat)) reg_offset = self.reg_offset_conv2(reg_offset) reg_bbox = reg_bbox.transpose([0, 2, 1]).reshape([b, 4, h, w]) anchor_centers = anchor_centers.reshape([1, h, w, 2]) bbox_pred = self._reg_grid_sample(reg_bbox, reg_offset, anchor_centers) bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) else: bbox_pred = reg_bbox if not self.training: bbox_pred *= stride bbox_pred_list.append(bbox_pred) cls_score_list = paddle.concat(cls_score_list, axis=1) bbox_pred_list = paddle.concat(bbox_pred_list, axis=1) return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if 
alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy( score, label, weight=weight, reduction='sum') return loss def get_loss(self, head_outs, gt_meta): pred_scores, pred_bboxes, anchors, \ num_anchors_list, stride_tensor = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = 0.25 else: assigned_labels, assigned_bboxes, assigned_scores = self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, bbox_center(anchors), num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox assigned_bboxes /= stride_tensor # classification loss loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l) # select positive samples mask mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.astype(paddle.float32).sum() # bbox regression loss if num_pos > 0: bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 4]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).unsqueeze(-1) # iou loss loss_iou = self.giou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / bbox_weight.sum() # l1 loss loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) else: loss_iou = paddle.zeros([]) loss_l1 = paddle.zeros([]) loss_cls /= assigned_scores.sum().clip(min=1) loss = self.loss_weight['class'] * loss_cls + self.loss_weight[ 'iou'] * loss_iou return { 'loss': loss, 'loss_class': loss_cls, 'loss_iou': loss_iou, 'loss_l1': loss_l1 } def post_process(self, head_outs, img_shape, scale_factor): pred_scores, pred_bboxes, _, _, _ = head_outs pred_scores = pred_scores.transpose([0, 2, 1]) for i in range(len(pred_bboxes)): pred_bboxes[i, :, 0] = pred_bboxes[i, :, 0].clip( min=0, max=img_shape[i, 1]) pred_bboxes[i, :, 1] = pred_bboxes[i, :, 1].clip( min=0, max=img_shape[i, 0]) pred_bboxes[i, :, 2] = pred_bboxes[i, :, 2].clip( min=0, max=img_shape[i, 1]) pred_bboxes[i, :, 3] = pred_bboxes[i, :, 3].clip( min=0, max=img_shape[i, 0]) # scale bbox to origin scale_factor = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) pred_bboxes /= scale_factor bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/heads/ttf_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
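# TTFNet is anchor-free: HMHead predicts a per-class center heatmap and
# WHHead predicts four non-negative distances (scaled by wh_offset_base)
# from each grid cell to the box sides; get_loss decodes boxes by
# offsetting a base grid spaced down_ratio pixels apart (base_loc).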
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant, Normal from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ppdet.modeling.layers import DeformableConvV2, LiteConv import numpy as np @register class HMHead(nn.Layer): """ Args: ch_in (int): The channel number of input Tensor. ch_out (int): The channel number of output Tensor. num_classes (int): Number of classes. conv_num (int): The convolution number of hm_feat. dcn_head(bool): whether use dcn in head. False by default. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default Return: Heatmap head output """ __shared__ = ['num_classes', 'norm_type'] def __init__( self, ch_in, ch_out=128, num_classes=80, conv_num=2, dcn_head=False, lite_head=False, norm_type='bn', ): super(HMHead, self).__init__() head_conv = nn.Sequential() for i in range(conv_num): name = 'conv.{}'.format(i) if lite_head: lite_name = 'hm.' + name head_conv.add_sublayer( lite_name, LiteConv( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, norm_type=norm_type)) else: if dcn_head: head_conv.add_sublayer( name, DeformableConvV2( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) else: head_conv.add_sublayer( name, nn.Conv2D( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0, 0.01)), bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.)))) head_conv.add_sublayer(name + '.act', nn.ReLU()) self.feat = head_conv bias_init = float(-np.log((1 - 0.01) / 0.01)) weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, 0.01)) self.head = nn.Conv2D( in_channels=ch_out, out_channels=num_classes, kernel_size=1, weight_attr=weight_attr, bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.), initializer=Constant(bias_init))) def forward(self, feat): out = self.feat(feat) out = self.head(out) return out @register class WHHead(nn.Layer): """ Args: ch_in (int): The channel number of input Tensor. ch_out (int): The channel number of output Tensor. conv_num (int): The convolution number of wh_feat. dcn_head(bool): whether use dcn in head. False by default. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default Return: Width & Height head output """ __shared__ = ['norm_type'] def __init__(self, ch_in, ch_out=64, conv_num=2, dcn_head=False, lite_head=False, norm_type='bn'): super(WHHead, self).__init__() head_conv = nn.Sequential() for i in range(conv_num): name = 'conv.{}'.format(i) if lite_head: lite_name = 'wh.' 
+ name head_conv.add_sublayer( lite_name, LiteConv( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, norm_type=norm_type)) else: if dcn_head: head_conv.add_sublayer( name, DeformableConvV2( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) else: head_conv.add_sublayer( name, nn.Conv2D( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0, 0.01)), bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.)))) head_conv.add_sublayer(name + '.act', nn.ReLU()) weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, 0.01)) self.feat = head_conv self.head = nn.Conv2D( in_channels=ch_out, out_channels=4, kernel_size=1, weight_attr=weight_attr, bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.))) def forward(self, feat): out = self.feat(feat) out = self.head(out) out = F.relu(out) return out @register class TTFHead(nn.Layer): """ TTFHead Args: in_channels (int): the channel number of input to TTFHead. num_classes (int): the number of classes, 80 by default. hm_head_planes (int): the channel number in heatmap head, 128 by default. wh_head_planes (int): the channel number in width & height head, 64 by default. hm_head_conv_num (int): the number of convolution in heatmap head, 2 by default. wh_head_conv_num (int): the number of convolution in width & height head, 2 by default. hm_loss (object): Instance of 'CTFocalLoss'. wh_loss (object): Instance of 'GIoULoss'. wh_offset_base (float): the base offset of width and height, 16.0 by default. down_ratio (int): the actual down_ratio is calculated by base_down_ratio (default 16) and the number of upsample layers. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default ags_module(bool): whether use AGS module to reweight location feature. false by default. """ __shared__ = ['num_classes', 'down_ratio', 'norm_type'] __inject__ = ['hm_loss', 'wh_loss'] def __init__(self, in_channels, num_classes=80, hm_head_planes=128, wh_head_planes=64, hm_head_conv_num=2, wh_head_conv_num=2, hm_loss='CTFocalLoss', wh_loss='GIoULoss', wh_offset_base=16., down_ratio=4, dcn_head=False, lite_head=False, norm_type='bn', ags_module=False): super(TTFHead, self).__init__() self.in_channels = in_channels self.hm_head = HMHead(in_channels, hm_head_planes, num_classes, hm_head_conv_num, dcn_head, lite_head, norm_type) self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num, dcn_head, lite_head, norm_type) self.hm_loss = hm_loss self.wh_loss = wh_loss self.wh_offset_base = wh_offset_base self.down_ratio = down_ratio self.ags_module = ags_module @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels, } def forward(self, feats): hm = self.hm_head(feats) wh = self.wh_head(feats) * self.wh_offset_base return hm, wh def filter_box_by_weight(self, pred, target, weight): """ Filter out boxes where ttf_reg_weight is 0, only keep positive samples. 
""" index = paddle.nonzero(weight > 0) index.stop_gradient = True weight = paddle.gather_nd(weight, index) pred = paddle.gather_nd(pred, index) target = paddle.gather_nd(target, index) return pred, target, weight def filter_loc_by_weight(self, score, weight): index = paddle.nonzero(weight > 0) index.stop_gradient = True score = paddle.gather_nd(score, index) return score def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight): pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4) hm_loss = self.hm_loss(pred_hm, target_hm) H, W = target_hm.shape[2:] mask = paddle.reshape(target_weight, [-1, H, W]) avg_factor = paddle.sum(mask) + 1e-4 base_step = self.down_ratio shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32') shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32') shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x]) base_loc = paddle.stack([shift_x, shift_y], axis=0) base_loc.stop_gradient = True pred_boxes = paddle.concat( [0 - pred_wh[:, 0:2, :, :] + base_loc.astype(pred_wh.dtype), pred_wh[:, 2:4] + base_loc.astype(pred_wh.dtype)], axis=1) pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1]) boxes = paddle.transpose(box_target, [0, 2, 3, 1]) boxes.stop_gradient = True if self.ags_module: pred_hm_max = paddle.max(pred_hm, axis=1, keepdim=True) pred_hm_max_softmax = F.softmax(pred_hm_max, axis=1) pred_hm_max_softmax = paddle.transpose(pred_hm_max_softmax, [0, 2, 3, 1]) pred_hm_max_softmax = self.filter_loc_by_weight(pred_hm_max_softmax, mask) else: pred_hm_max_softmax = None pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes, mask) mask.stop_gradient = True wh_loss = self.wh_loss( pred_boxes, boxes, iou_weight=mask.unsqueeze(1), loc_reweight=pred_hm_max_softmax) wh_loss = wh_loss / avg_factor ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss} return ttf_loss ================================================ FILE: ppdet/modeling/heads/vitpose_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.keypoint_utils import resize, flip_back from paddle.nn.initializer import TruncatedNormal, Constant, Normal from ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d trunc_normal_ = TruncatedNormal(std=.02) normal_ = Normal(std=0.001) zeros_ = Constant(value=0.) ones_ = Constant(value=1.) 
__all__ = ['TopdownHeatmapSimpleHead'] @register class TopdownHeatmapSimpleHead(nn.Layer): def __init__(self, in_channels=768, out_channels=17, num_deconv_layers=3, num_deconv_filters=(256, 256, 256), num_deconv_kernels=(4, 4, 4), extra=None, in_index=0, input_transform=None, align_corners=False, upsample=0, flip_pairs=None, shift_heatmap=False, target_type='GaussianHeatmap'): super(TopdownHeatmapSimpleHead, self).__init__() self.in_channels = in_channels self.upsample = upsample self.flip_pairs = flip_pairs self.shift_heatmap = shift_heatmap self.target_type = target_type self._init_inputs(in_channels, in_index, input_transform) self.in_index = in_index self.align_corners = align_corners if extra is not None and not isinstance(extra, dict): raise TypeError('extra should be dict or None.') if num_deconv_layers > 0: self.deconv_layers = self._make_deconv_layer( num_deconv_layers, num_deconv_filters, num_deconv_kernels, ) elif num_deconv_layers == 0: self.deconv_layers = nn.Identity() else: raise ValueError( f'num_deconv_layers ({num_deconv_layers}) should >= 0.') identity_final_layer = False if extra is not None and 'final_conv_kernel' in extra: assert extra['final_conv_kernel'] in [0, 1, 3] if extra['final_conv_kernel'] == 3: padding = 1 elif extra['final_conv_kernel'] == 1: padding = 0 else: # 0 for Identity mapping. identity_final_layer = True kernel_size = extra['final_conv_kernel'] else: kernel_size = 1 padding = 0 if identity_final_layer: self.final_layer = nn.Identity() else: conv_channels = num_deconv_filters[ -1] if num_deconv_layers > 0 else self.in_channels layers = [] if extra is not None: num_conv_layers = extra.get('num_conv_layers', 0) num_conv_kernels = extra.get('num_conv_kernels', [1] * num_conv_layers) for i in range(num_conv_layers): layers.append( nn.Conv2D( in_channels=conv_channels, out_channels=conv_channels, kernel_size=num_conv_kernels[i], stride=1, padding=(num_conv_kernels[i] - 1) // 2)) layers.append(nn.BatchNorm2D(conv_channels)) layers.append(nn.ReLU()) layers.append( nn.Conv2D( in_channels=conv_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, padding=(padding, padding))) if len(layers) > 1: self.final_layer = nn.Sequential(*layers) else: self.final_layer = layers[0] self.init_weights() @staticmethod def _get_deconv_cfg(deconv_kernel): """Get configurations for deconv layers.""" if deconv_kernel == 4: padding = 1 output_padding = 0 elif deconv_kernel == 3: padding = 1 output_padding = 1 elif deconv_kernel == 2: padding = 0 output_padding = 0 else: raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') return deconv_kernel, padding, output_padding def _init_inputs(self, in_channels, in_index, input_transform): """Check and initialize input transforms. """ if input_transform is not None: assert input_transform in ['resize_concat', 'multiple_select'] self.input_transform = input_transform self.in_index = in_index if input_transform is not None: assert isinstance(in_channels, (list, tuple)) assert isinstance(in_index, (list, tuple)) assert len(in_channels) == len(in_index) if input_transform == 'resize_concat': self.in_channels = sum(in_channels) else: self.in_channels = in_channels else: assert isinstance(in_channels, int) assert isinstance(in_index, int) self.in_channels = in_channels def _transform_inputs(self, inputs): """Transform inputs for decoder. 
""" if not isinstance(inputs, list): if not isinstance(inputs, list): if self.upsample > 0: inputs = resize( input=F.relu(inputs), scale_factor=self.upsample, mode='bilinear', align_corners=self.align_corners) return inputs if self.input_transform == 'resize_concat': inputs = [inputs[i] for i in self.in_index] upsampled_inputs = [ resize( input=x, size=inputs[0].shape[2:], mode='bilinear', align_corners=self.align_corners) for x in inputs ] inputs = paddle.concat(upsampled_inputs, dim=1) elif self.input_transform == 'multiple_select': inputs = [inputs[i] for i in self.in_index] else: inputs = inputs[self.in_index] return inputs def forward(self, x): """Forward function.""" x = self._transform_inputs(x) x = self.deconv_layers(x) x = self.final_layer(x) return x def inference_model(self, x, flip_pairs=None): """Inference function. Returns: output_heatmap (np.ndarray): Output heatmaps. Args: x (torch.Tensor[N,K,H,W]): Input features. flip_pairs (None | list[tuple]): Pairs of keypoints which are mirrored. """ output = self.forward(x) if flip_pairs is not None: output_heatmap = flip_back( output, self.flip_pairs, target_type=self.target_type) # feature is not aligned, shift flipped heatmap for higher accuracy if self.shift_heatmap: output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] else: output_heatmap = output return output_heatmap def _make_deconv_layer(self, num_layers, num_filters, num_kernels): """Make deconv layers.""" if num_layers != len(num_filters): error_msg = f'num_layers({num_layers}) ' \ f'!= length of num_filters({len(num_filters)})' raise ValueError(error_msg) if num_layers != len(num_kernels): error_msg = f'num_layers({num_layers}) ' \ f'!= length of num_kernels({len(num_kernels)})' raise ValueError(error_msg) layers = [] for i in range(num_layers): kernel, padding, output_padding = \ self._get_deconv_cfg(num_kernels[i]) planes = num_filters[i] layers.append( ConvTranspose2d( in_channels=self.in_channels, out_channels=planes, kernel_size=kernel, stride=2, padding=padding, output_padding=output_padding, bias=False)) layers.append(nn.BatchNorm2D(planes)) layers.append(nn.ReLU()) self.in_channels = planes return nn.Sequential(*layers) def init_weights(self): """Initialize model weights.""" if not isinstance(self.deconv_layers, nn.Identity): for m in self.deconv_layers: if isinstance(m, nn.BatchNorm2D): ones_(m.weight) ones_(m.bias) if not isinstance(self.final_layer, nn.Conv2D): for m in self.final_layer: if isinstance(m, nn.Conv2D): normal_(m.weight) zeros_(m.bias) elif isinstance(m, nn.BatchNorm2D): ones_(m.weight) ones_(m.bias) else: normal_(self.final_layer.weight) zeros_(self.final_layer.bias) ================================================ FILE: ppdet/modeling/heads/yolo_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register import math import numpy as np from ..initializer import bias_init_with_prob, constant_ from ..backbones.csp_darknet import BaseConv, DWConv from ..losses import IouLoss from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner from ppdet.modeling.bbox_utils import bbox_overlaps from ppdet.modeling.layers import MultiClassNMS __all__ = ['YOLOv3Head', 'YOLOXHead'] def _de_sigmoid(x, eps=1e-7): x = paddle.clip(x, eps, 1. / eps) x = paddle.clip(1. / x - 1., eps, 1. / eps) x = -paddle.log(x) return x @register class YOLOv3Head(nn.Layer): __shared__ = ['num_classes', 'data_format'] __inject__ = ['loss'] def __init__(self, in_channels=[1024, 512, 256], anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], num_classes=80, loss='YOLOv3Loss', iou_aware=False, iou_aware_factor=0.4, data_format='NCHW'): """ Head for YOLOv3 network Args: num_classes (int): number of foreground classes anchors (list): anchors anchor_masks (list): anchor masks loss (object): YOLOv3Loss instance iou_aware (bool): whether to use iou_aware iou_aware_factor (float): iou aware factor data_format (str): data format, NCHW or NHWC """ super(YOLOv3Head, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_classes = num_classes self.loss = loss self.iou_aware = iou_aware self.iou_aware_factor = iou_aware_factor self.parse_anchor(anchors, anchor_masks) self.num_outputs = len(self.anchors) self.data_format = data_format self.yolo_outputs = [] for i in range(len(self.anchors)): if self.iou_aware: num_filters = len(self.anchors[i]) * (self.num_classes + 6) else: num_filters = len(self.anchors[i]) * (self.num_classes + 5) name = 'yolo_output.{}'.format(i) conv = nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_filters, kernel_size=1, stride=1, padding=0, data_format=data_format, bias_attr=ParamAttr(regularizer=L2Decay(0.))) conv.skip_quant = True yolo_output = self.add_sublayer(name, conv) self.yolo_outputs.append(yolo_output) def parse_anchor(self, anchors, anchor_masks): self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] self.mask_anchors = [] anchor_num = len(anchors) for masks in anchor_masks: self.mask_anchors.append([]) for mask in masks: assert mask < anchor_num, "anchor mask index overflow" self.mask_anchors[-1].extend(anchors[mask]) def forward(self, feats, targets=None): assert len(feats) == len(self.anchors) yolo_outputs = [] for i, feat in enumerate(feats): yolo_output = self.yolo_outputs[i](feat) if self.data_format == 'NHWC': yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) yolo_outputs.append(yolo_output) if self.training: return self.loss(yolo_outputs, targets, self.anchors) else: if self.iou_aware: y = [] for i, out in enumerate(yolo_outputs): na = len(self.anchors[i]) ioup, x = out[:, 0:na, :, :], out[:, na:, :, :] b, c, h, w = x.shape no = c // na x = x.reshape((b, na, no, h * w)) ioup = ioup.reshape((b, na, 1, h * w)) obj = x[:, :, 4:5, :] ioup = F.sigmoid(ioup) obj = F.sigmoid(obj) obj_t = (obj**(1 - self.iou_aware_factor)) * ( ioup**self.iou_aware_factor) obj_t = _de_sigmoid(obj_t) loc_t = x[:, :, :4, :] cls_t = x[:, :, 5:, :] y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2) y_t = y_t.reshape((b, c, h, w)) y.append(y_t) 
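# y_t restores the original (b, c, h, w) layout with the fused
# objectness written back into channel 4 of each anchor's slice.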
return y else: return yolo_outputs @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @register class YOLOXHead(nn.Layer): __shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms'] __inject__ = ['assigner', 'nms'] def __init__(self, num_classes=80, width_mult=1.0, depthwise=False, in_channels=[256, 512, 1024], feat_channels=256, fpn_strides=(8, 16, 32), l1_epoch=285, act='silu', assigner=SimOTAAssigner(use_vfl=False), nms='MultiClassNMS', loss_weight={ 'cls': 1.0, 'obj': 1.0, 'iou': 5.0, 'l1': 1.0, }, trt=False, exclude_nms=False): super(YOLOXHead, self).__init__() self._dtype = paddle.framework.get_default_dtype() self.num_classes = num_classes assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels feat_channels = int(feat_channels * width_mult) self.fpn_strides = fpn_strides self.l1_epoch = l1_epoch self.assigner = assigner self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms self.loss_weight = loss_weight self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5 ConvBlock = DWConv if depthwise else BaseConv self.stem_conv = nn.LayerList() self.conv_cls = nn.LayerList() self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj for in_c in self.in_channels: self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act)) self.conv_cls.append( nn.Sequential(* [ ConvBlock( feat_channels, feat_channels, 3, 1, act=act), ConvBlock( feat_channels, feat_channels, 3, 1, act=act), nn.Conv2D( feat_channels, self.num_classes, 1, bias_attr=ParamAttr(regularizer=L2Decay(0.0))) ])) self.conv_reg.append( nn.Sequential(* [ ConvBlock( feat_channels, feat_channels, 3, 1, act=act), ConvBlock( feat_channels, feat_channels, 3, 1, act=act), nn.Conv2D( feat_channels, 4 + 1, # reg [x,y,w,h] + obj 1, bias_attr=ParamAttr(regularizer=L2Decay(0.0))) ])) self._init_weights() @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype) bias_reg[:2] = 0. 
bias_reg[-1] = bias_cls for cls_, reg_ in zip(self.conv_cls, self.conv_reg): constant_(cls_[-1].weight) constant_(cls_[-1].bias, bias_cls) constant_(reg_[-1].weight) reg_[-1].bias.set_value(bias_reg) def _generate_anchor_point(self, feat_sizes, strides, offset=0.): anchor_points, stride_tensor = [], [] num_anchors_list = [] for feat_size, stride in zip(feat_sizes, strides): h, w = feat_size x = (paddle.arange(w) + offset) * stride y = (paddle.arange(h) + offset) * stride y, x = paddle.meshgrid(y, x) anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) stride_tensor.append( paddle.full( [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) num_anchors_list.append(len(anchor_points[-1])) anchor_points = paddle.concat(anchor_points).astype(self._dtype) anchor_points.stop_gradient = True stride_tensor = paddle.concat(stride_tensor) stride_tensor.stop_gradient = True return anchor_points, stride_tensor, num_anchors_list def forward(self, feats, targets=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] cls_score_list, reg_pred_list = [], [] obj_score_list = [] for i, feat in enumerate(feats): feat = self.stem_conv[i](feat) cls_logit = self.conv_cls[i](feat) reg_pred = self.conv_reg[i](feat) # cls prediction cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) # reg prediction reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1) reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1]) reg_pred_list.append(reg_xywh) # obj prediction obj_score = F.sigmoid(obj_logit) obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) reg_pred_list = paddle.concat(reg_pred_list, axis=1) obj_score_list = paddle.concat(obj_score_list, axis=1) # bbox decode anchor_points, stride_tensor, _ =\ self._generate_anchor_point(feat_sizes, self.fpn_strides) reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1) reg_xy += (anchor_points / stride_tensor) reg_wh = paddle.exp(reg_wh) * 0.5 bbox_pred_list = paddle.concat( [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1) if self.training: anchor_points, stride_tensor, num_anchors_list =\ self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5) yolox_losses = self.get_loss([ cls_score_list, bbox_pred_list, obj_score_list, anchor_points, stride_tensor, num_anchors_list ], targets) return yolox_losses else: pred_scores = (cls_score_list * obj_score_list).sqrt() return pred_scores, bbox_pred_list, stride_tensor def get_loss(self, head_outs, targets): pred_cls, pred_bboxes, pred_obj,\ anchor_points, stride_tensor, num_anchors_list = head_outs gt_labels = targets['gt_class'] gt_bboxes = targets['gt_bbox'] pred_scores = (pred_cls * pred_obj).sqrt() # label assignment center_and_strides = paddle.concat( [anchor_points, stride_tensor, stride_tensor], axis=-1) pos_num_list, label_list, bbox_target_list = [], [], [] for pred_score, pred_bbox, gt_box, gt_label in zip( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels): pos_num, label, _, bbox_target = self.assigner( pred_score, center_and_strides, pred_bbox, gt_box, gt_label) pos_num_list.append(pos_num) label_list.append(label) bbox_target_list.append(bbox_target) labels = paddle.to_tensor(np.stack(label_list, axis=0)) bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0)) bbox_targets /= stride_tensor # rescale bbox # 1. 
obj score loss mask_positive = (labels != self.num_classes) loss_obj = F.binary_cross_entropy( pred_obj, mask_positive.astype(pred_obj.dtype).unsqueeze(-1), reduction='sum') num_pos = sum(pos_num_list) if num_pos > 0: num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1) loss_obj /= num_pos # 2. iou loss bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( bbox_targets, bbox_mask).reshape([-1, 4]) bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos) bbox_iou = paddle.diag(bbox_iou) loss_iou = self.iou_loss( pred_bboxes_pos.split( 4, axis=-1), assigned_bboxes_pos.split( 4, axis=-1)) loss_iou = loss_iou.sum() / num_pos # 3. cls loss cls_mask = mask_positive.unsqueeze(-1).tile( [1, 1, self.num_classes]) pred_cls_pos = paddle.masked_select( pred_cls, cls_mask).reshape([-1, self.num_classes]) assigned_cls_pos = paddle.masked_select(labels, mask_positive) assigned_cls_pos = F.one_hot(assigned_cls_pos, self.num_classes + 1)[..., :-1] assigned_cls_pos *= bbox_iou.unsqueeze(-1) loss_cls = F.binary_cross_entropy( pred_cls_pos, assigned_cls_pos, reduction='sum') loss_cls /= num_pos # 4. l1 loss if targets['epoch_id'] >= self.l1_epoch: loss_l1 = F.l1_loss( pred_bboxes_pos, assigned_bboxes_pos, reduction='sum') loss_l1 /= num_pos else: loss_l1 = paddle.zeros([]) loss_l1.stop_gradient = False else: loss_cls = paddle.zeros([]) loss_iou = paddle.zeros([]) loss_l1 = paddle.zeros([]) loss_cls.stop_gradient = False loss_iou.stop_gradient = False loss_l1.stop_gradient = False loss = self.loss_weight['obj'] * loss_obj + \ self.loss_weight['cls'] * loss_cls + \ self.loss_weight['iou'] * loss_iou if targets['epoch_id'] >= self.l1_epoch: loss += (self.loss_weight['l1'] * loss_l1) yolox_losses = { 'loss': loss, 'loss_cls': loss_cls, 'loss_obj': loss_obj, 'loss_iou': loss_iou, 'loss_l1': loss_l1, } return yolox_losses def post_process(self, head_outs, img_shape, scale_factor): pred_scores, pred_bboxes, stride_tensor = head_outs pred_scores = pred_scores.transpose([0, 2, 1]) pred_bboxes *= stride_tensor # scale bbox to origin image scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) pred_bboxes /= scale_factor if self.exclude_nms: # `exclude_nms=True` just use in benchmark return pred_bboxes.sum(), pred_scores.sum() else: bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/heads/yolof_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
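# YOLOF detects from a single feature level: classification logits are
# fused with an objectness branch via the implicit-objectness term
#   norm_cls = cls + obj - log(1 + exp(cls) + exp(obj))
# (computed with clipping in forward()), and positive anchors are chosen
# by the injected UniformAssigner instead of IoU thresholds.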
import math import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Normal, Constant from ppdet.modeling.layers import MultiClassNMS from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import delta2bbox_v2 __all__ = ['YOLOFHead'] INF = 1e8 def reduce_mean(tensor): world_size = paddle.distributed.get_world_size() if world_size == 1: return tensor paddle.distributed.all_reduce(tensor) return tensor / world_size def find_inside_anchor(feat_size, stride, num_anchors, im_shape): feat_h, feat_w = feat_size[:2] im_h, im_w = im_shape[:2] inside_h = min(int(np.ceil(im_h / stride)), feat_h) inside_w = min(int(np.ceil(im_w / stride)), feat_w) inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool) inside_mask[:inside_h, :inside_w] = True inside_mask = inside_mask.unsqueeze(-1).expand( [feat_h, feat_w, num_anchors]) return inside_mask.reshape([-1]) @register class YOLOFFeat(nn.Layer): def __init__(self, feat_in=256, feat_out=256, num_cls_convs=2, num_reg_convs=4, norm_type='bn'): super(YOLOFFeat, self).__init__() assert norm_type == 'bn', "YOLOFFeat only support BN now." self.feat_in = feat_in self.feat_out = feat_out self.num_cls_convs = num_cls_convs self.num_reg_convs = num_reg_convs self.norm_type = norm_type cls_subnet, reg_subnet = [], [] for i in range(self.num_cls_convs): feat_in = self.feat_in if i == 0 else self.feat_out cls_subnet.append( nn.Conv2D( feat_in, self.feat_out, 3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0.0)))) cls_subnet.append( nn.BatchNorm2D( self.feat_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) cls_subnet.append(nn.ReLU()) for i in range(self.num_reg_convs): feat_in = self.feat_in if i == 0 else self.feat_out reg_subnet.append( nn.Conv2D( feat_in, self.feat_out, 3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0.0)))) reg_subnet.append( nn.BatchNorm2D( self.feat_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) reg_subnet.append(nn.ReLU()) self.cls_subnet = nn.Sequential(*cls_subnet) self.reg_subnet = nn.Sequential(*reg_subnet) def forward(self, fpn_feat): cls_feat = self.cls_subnet(fpn_feat) reg_feat = self.reg_subnet(fpn_feat) return cls_feat, reg_feat @register class YOLOFHead(nn.Layer): __shared__ = ['num_classes', 'trt', 'exclude_nms'] __inject__ = [ 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', 'loss_bbox', 'nms' ] def __init__(self, num_classes=80, conv_feat='YOLOFFeat', anchor_generator='AnchorGenerator', bbox_assigner='UniformAssigner', loss_class='FocalLoss', loss_bbox='GIoULoss', ctr_clip=32.0, delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[1.0, 1.0, 1.0, 1.0], nms='MultiClassNMS', prior_prob=0.01, nms_pre=1000, use_inside_anchor=False, trt=False, exclude_nms=False): super(YOLOFHead, self).__init__() self.num_classes = num_classes self.conv_feat = conv_feat self.anchor_generator = anchor_generator self.na = self.anchor_generator.num_anchors self.bbox_assigner = bbox_assigner self.loss_class = loss_class self.loss_bbox = loss_bbox self.ctr_clip = ctr_clip self.delta_mean = delta_mean self.delta_std = delta_std self.nms = nms self.nms_pre = nms_pre self.use_inside_anchor = use_inside_anchor if 
isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms bias_init_value = -math.log((1 - prior_prob) / prior_prob) self.cls_score = self.add_sublayer( 'cls_score', nn.Conv2D( in_channels=conv_feat.feat_out, out_channels=self.num_classes * self.na, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant( value=bias_init_value)))) self.bbox_pred = self.add_sublayer( 'bbox_pred', nn.Conv2D( in_channels=conv_feat.feat_out, out_channels=4 * self.na, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.object_pred = self.add_sublayer( 'object_pred', nn.Conv2D( in_channels=conv_feat.feat_out, out_channels=self.na, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) def forward(self, feats, targets=None): assert len(feats) == 1, "YOLOF only has one level feature." conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0]) cls_logits = self.cls_score(conv_cls_feat) objectness = self.object_pred(conv_reg_feat) bboxes_reg = self.bbox_pred(conv_reg_feat) N, C, H, W = cls_logits.shape[:] cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W)) objectness = objectness.reshape((N, self.na, 1, H, W)) norm_cls_logits = cls_logits + objectness - paddle.log( 1.0 + paddle.clip( cls_logits.exp(), max=INF) + paddle.clip( objectness.exp(), max=INF)) norm_cls_logits = norm_cls_logits.reshape((N, C, H, W)) anchors = self.anchor_generator([norm_cls_logits]) if self.training: yolof_losses = self.get_loss( [anchors[0], norm_cls_logits, bboxes_reg], targets) return yolof_losses else: return anchors[0], norm_cls_logits, bboxes_reg def get_loss(self, head_outs, targets): anchors, cls_logits, bbox_preds = head_outs feat_size = cls_logits.shape[-2:] cls_logits = cls_logits.transpose([0, 2, 3, 1]) cls_logits = cls_logits.reshape([0, -1, self.num_classes]) bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) bbox_preds = bbox_preds.reshape([0, -1, 4]) num_pos_list = [] cls_pred_list, cls_tar_list = [], [] reg_pred_list, reg_tar_list = [], [] # find and gather preds and targets in each image for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip( cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'], targets['im_shape']): if self.use_inside_anchor: inside_mask = find_inside_anchor( feat_size, self.anchor_generator.strides[0], self.na, im_shape.tolist()) cls_logit = cls_logit[inside_mask] bbox_pred = bbox_pred[inside_mask] anchors = anchors[inside_mask] bbox_pred = delta2bbox_v2( bbox_pred, anchors, self.delta_mean, self.delta_std, ctr_clip=self.ctr_clip) bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) # -2:ignore, -1:neg, >=0:pos match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner( bbox_pred, anchors, gt_bbox) pos_mask = (match_labels >= 0) neg_mask = (match_labels == -1) chosen_mask = paddle.logical_or(pos_mask, neg_mask) gt_class = gt_class.reshape([-1]) bg_class = paddle.to_tensor( [self.num_classes], dtype=gt_class.dtype) # a trick to assign num_classes to negative targets gt_class = paddle.concat([gt_class, bg_class], axis=-1) match_labels = paddle.where( neg_mask, paddle.full_like(match_labels, gt_class.size - 1), match_labels) num_pos_list.append(max(1.0, pos_mask.sum().item())) cls_pred_list.append(cls_logit[chosen_mask]) 
    def get_bboxes_single(self,
                          anchors,
                          cls_scores,
                          bbox_preds,
                          im_shape,
                          scale_factor,
                          rescale=True):
        assert len(cls_scores) == len(bbox_preds)
        mlvl_bboxes = []
        mlvl_scores = []
        for anchor, cls_score, bbox_pred in zip(anchors, cls_scores,
                                                bbox_preds):
            cls_score = cls_score.reshape([-1, self.num_classes])
            bbox_pred = bbox_pred.reshape([-1, 4])
            if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:
                max_score = cls_score.max(axis=1)
                _, topk_inds = max_score.topk(self.nms_pre)
                bbox_pred = bbox_pred.gather(topk_inds)
                anchor = anchor.gather(topk_inds)
                cls_score = cls_score.gather(topk_inds)
            bbox_pred = delta2bbox_v2(
                bbox_pred,
                anchor,
                self.delta_mean,
                self.delta_std,
                max_shape=im_shape,
                ctr_clip=self.ctr_clip).squeeze()
            mlvl_bboxes.append(bbox_pred)
            mlvl_scores.append(F.sigmoid(cls_score))
        mlvl_bboxes = paddle.concat(mlvl_bboxes)
        mlvl_bboxes = paddle.squeeze(mlvl_bboxes)
        if rescale:
            mlvl_bboxes = mlvl_bboxes / paddle.concat(
                [scale_factor[::-1], scale_factor[::-1]])
        mlvl_scores = paddle.concat(mlvl_scores)
        mlvl_scores = mlvl_scores.transpose([1, 0])
        return mlvl_bboxes, mlvl_scores

    def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor):
        batch_bboxes = []
        batch_scores = []
        for img_id in range(cls_scores[0].shape[0]):
            num_lvls = len(cls_scores)
            cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)]
            bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)]
            bboxes, scores = self.get_bboxes_single(
                anchors, cls_score_list, bbox_pred_list, im_shape[img_id],
                scale_factor[img_id])
            batch_bboxes.append(bboxes)
            batch_scores.append(scores)
        batch_bboxes = paddle.stack(batch_bboxes, 0)
        batch_scores = paddle.stack(batch_scores, 0)
        return batch_bboxes, batch_scores

    def post_process(self, head_outs, im_shape, scale_factor):
        anchors, cls_scores, bbox_preds = head_outs
        cls_scores = cls_scores.transpose([0, 2, 3, 1])
        bbox_preds = bbox_preds.transpose([0, 2, 3, 1])
        pred_bboxes, pred_scores = self.decode(
            [anchors], [cls_scores], [bbox_preds], im_shape, scale_factor)

        if self.exclude_nms:
            # `exclude_nms=True` is only used for speed benchmarking
            return pred_bboxes.sum(), pred_scores.sum()
        else:
            bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
            return bbox_pred, bbox_num
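# --- Decoding sketch (for intuition only; not the exact implementation) ---
# get_bboxes_single() relies on delta2bbox_v2 from ppdet/modeling/bbox_utils.
# Under the conventional R-CNN parameterization it roughly amounts to, for an
# anchor (x1, y1, x2, y2) and a denormalized delta d = d * delta_std + delta_mean:
#
#   import numpy as np
#   def decode_one(anchor, d, ctr_clip=32.0):
#       x1, y1, x2, y2 = anchor
#       w, h = x2 - x1, y2 - y1
#       cx, cy = (x1 + x2) / 2., (y1 + y2) / 2.
#       dx, dy, dw, dh = d
#       # shift the center (the shift is clipped to +-ctr_clip pixels) ...
#       cx += np.clip(dx * w, -ctr_clip, ctr_clip)
#       cy += np.clip(dy * h, -ctr_clip, ctr_clip)
#       # ... and rescale width/height
#       w, h = w * np.exp(dw), h * np.exp(dh)
#       return [cx - w / 2., cy - h / 2., cx + w / 2., cy + h / 2.]
#
# The real delta2bbox_v2 additionally clips boxes to `max_shape` (the image);
# consult bbox_utils.py for the exact variant.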
================================================
FILE: ppdet/modeling/initializer.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
The copyright of pytorch/pytorch is a BSD-style license, as found in the
LICENSE file.
"""

import math
import numpy as np

import paddle
import paddle.nn as nn

__all__ = [
    'uniform_', 'normal_', 'constant_', 'ones_', 'zeros_', 'xavier_uniform_',
    'xavier_normal_', 'kaiming_uniform_', 'kaiming_normal_', 'linear_init_',
    'conv_init_', 'reset_initialized_parameter',
]


def _no_grad_uniform_(tensor, a, b):
    with paddle.no_grad():
        tensor.set_value(
            paddle.uniform(
                shape=tensor.shape, dtype=tensor.dtype, min=a, max=b))
    return tensor


def _no_grad_normal_(tensor, mean=0., std=1.):
    with paddle.no_grad():
        tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))
    return tensor


def _no_grad_fill_(tensor, value=0.):
    with paddle.no_grad():
        tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))
    return tensor


def uniform_(tensor, a, b):
    """
    Modify tensor in-place with values drawn from a uniform distribution.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float|int): min value.
        b (float|int): max value.
    Return:
        tensor
    """
    return _no_grad_uniform_(tensor, a, b)


def normal_(tensor, mean=0., std=1.):
    """
    Modify tensor in-place with values drawn from a normal distribution.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        mean (float|int): mean value.
        std (float|int): std value.
    Return:
        tensor
    """
    return _no_grad_normal_(tensor, mean, std)


def constant_(tensor, value=0.):
    """
    Modify tensor in-place, filling it with a constant.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        value (float|int): value to fill tensor.
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, value)


def ones_(tensor):
    """
    Modify tensor in-place, filling it with ones.
    Args:
        tensor (paddle.Tensor): paddle Tensor
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, 1)


def zeros_(tensor):
    """
    Modify tensor in-place, filling it with zeros.
    Args:
        tensor (paddle.Tensor): paddle Tensor
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, 0)


def vector_(tensor, vector):
    with paddle.no_grad():
        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
    return tensor


def _calculate_fan_in_and_fan_out(tensor, reverse=False):
    """
    Calculate (fan_in, fan_out) for a tensor.
    Args:
        tensor (Tensor): paddle.Tensor
        reverse (bool: False): tensor data format order, False by default as
            [fout, fin, ...]. e.g.: conv.weight [cout, cin, kh, kw] is False;
            linear.weight [cin, cout] is True.
    Return:
        Tuple[fan_in, fan_out]
    """
    if tensor.ndim < 2:
        raise ValueError(
            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
        )
    if reverse:
        num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
    else:
        num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
    receptive_field_size = 1
    if tensor.ndim > 2:
        receptive_field_size = np.prod(tensor.shape[2:])
    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size
    return fan_in, fan_out
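# --- Worked example (illustrative, not part of the original file) ---
# For a conv weight of shape [cout, cin, kh, kw] = [64, 3, 7, 7] with
# reverse=False:
#     receptive_field_size = 7 * 7 = 49
#     fan_in  = cin  * 49 = 3  * 49 = 147
#     fan_out = cout * 49 = 64 * 49 = 3136
# For a linear weight stored as [cin, cout], pass reverse=True so the two
# roles are swapped.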
def xavier_uniform_(tensor, gain=1., reverse=False):
    """
    Modify tensor in-place using the xavier_uniform_ method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    k = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -k, k)


def xavier_normal_(tensor, gain=1., reverse=False):
    """
    Modify tensor in-place using the xavier_normal_ method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    return _no_grad_normal_(tensor, 0, std)


# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
def _calculate_correct_fan(tensor, mode, reverse=False):
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError("Mode {} not supported, please use one of {}".format(
            mode, valid_modes))
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
    return fan_in if mode == 'fan_in' else fan_out


def _calculate_gain(nonlinearity, param=None):
    linear_fns = [
        'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
        'conv_transpose2d', 'conv_transpose3d'
    ]
    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    elif nonlinearity == 'leaky_relu':
        if param is None:
            negative_slope = 0.01
        elif not isinstance(param, bool) and isinstance(
                param, int) or isinstance(param, float):
            # True/False are instances of int, hence check above
            negative_slope = param
        else:
            raise ValueError("negative_slope {} not a valid number".format(
                param))
        return math.sqrt(2.0 / (1 + negative_slope**2))
    elif nonlinearity == 'selu':
        return 3.0 / 4
    else:
        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))


def kaiming_uniform_(tensor,
                     a=0,
                     mode='fan_in',
                     nonlinearity='leaky_relu',
                     reverse=False):
    """
    Modify tensor in-place using the kaiming_uniform method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float): the negative slope used with 'leaky_relu'.
        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
        nonlinearity (str): nonlinearity method name
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    gain = _calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    k = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -k, k)
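# --- Worked example (illustrative, not part of the original file) ---
# kaiming_uniform_ with mode='fan_in' and nonlinearity='relu' on the
# [64, 3, 7, 7] conv weight above uses fan = 147 and gain = sqrt(2):
#     std = sqrt(2) / sqrt(147) ~= 0.1166
#     k   = sqrt(3) * std      ~= 0.2020
# so weights are drawn from U(-k, k); kaiming_normal_ (below) instead draws
# from N(0, std^2) with the same std.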
def kaiming_normal_(tensor,
                    a=0,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    reverse=False):
    """
    Modify tensor in-place using the kaiming_normal_ method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float): the negative slope used with 'leaky_relu'.
        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
        nonlinearity (str): nonlinearity method name
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    gain = _calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return _no_grad_normal_(tensor, 0, std)


def linear_init_(module):
    bound = 1 / math.sqrt(module.weight.shape[0])
    uniform_(module.weight, -bound, bound)
    if hasattr(module, "bias") and module.bias is not None:
        uniform_(module.bias, -bound, bound)


def conv_init_(module):
    bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
    uniform_(module.weight, -bound, bound)
    if module.bias is not None:
        uniform_(module.bias, -bound, bound)


def bias_init_with_prob(prior_prob=0.01):
    """initialize conv/fc bias value according to a given probability value."""
    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
    return bias_init


@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
    """
    Re-initialize the parameters of [conv, linear, embedding, bn] sublayers
    using the methods above.
    Args:
        model (paddle.Layer): paddle Layer
        include_self (bool: True): passed to Layer.named_sublayers; indicates
            whether the layer itself is included.
    Return:
        None
    """
    for _, m in model.named_sublayers(include_self=include_self):
        if isinstance(m, nn.Conv2D):
            k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
                                    m._kernel_size[1])
            k = math.sqrt(k)
            _no_grad_uniform_(m.weight, -k, k)
            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
                _no_grad_uniform_(m.bias, -k, k)
        elif isinstance(m, nn.Linear):
            k = math.sqrt(1. / m.weight.shape[0])
            _no_grad_uniform_(m.weight, -k, k)
            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
                _no_grad_uniform_(m.bias, -k, k)
        elif isinstance(m, nn.Embedding):
            _no_grad_normal_(m.weight, mean=0., std=1.)
        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
            _no_grad_fill_(m.weight, 1.)
            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
                _no_grad_fill_(m.bias, 0)


================================================
FILE: ppdet/modeling/keypoint_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/open-mmlab/mmpose
"""

import warnings

import cv2
import numpy as np
import paddle.nn.functional as F


def get_affine_mat_kernel(h, w, s, inv=False):
    if w < h:
        w_ = s
        h_ = int(np.ceil((s / w * h) / 64.) * 64)
        scale_w = w
        scale_h = h_ / w_ * w
    else:
        h_ = s
        w_ = int(np.ceil((s / h * w) / 64.) * 64)
        scale_h = h
        scale_w = w_ / h_ * h

    center = np.array([np.round(w / 2.), np.round(h / 2.)])
    size_resized = (w_, h_)
    trans = get_affine_transform(
        center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv)

    return trans, size_resized


def get_affine_transform(center,
                         input_size,
                         rot,
                         output_size,
                         shift=(0., 0.),
                         inv=False):
    """Get the affine transform matrix, given the center/scale/rot/output_size.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ]): Size of input feature (width, height).
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height. Default (0., 0.). inv (bool): Option to inverse the affine transform direction. (inv=False: src->dst or inv=True: dst->src) Returns: np.ndarray: The transform matrix. """ assert len(center) == 2 assert len(output_size) == 2 assert len(shift) == 2 if not isinstance(input_size, (np.ndarray, list)): input_size = np.array([input_size, input_size], dtype=np.float32) scale_tmp = input_size shift = np.array(shift) src_w = scale_tmp[0] dst_w = output_size[0] dst_h = output_size[1] rot_rad = np.pi * rot / 180 src_dir = rotate_point([0., src_w * -0.5], rot_rad) dst_dir = np.array([0., dst_w * -0.5]) src = np.zeros((3, 2), dtype=np.float32) src[0, :] = center + scale_tmp * shift src[1, :] = center + src_dir + scale_tmp * shift src[2, :] = _get_3rd_point(src[0, :], src[1, :]) dst = np.zeros((3, 2), dtype=np.float32) dst[0, :] = [dst_w * 0.5, dst_h * 0.5] dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) if inv: trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) else: trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) return trans def get_warp_matrix(theta, size_input, size_dst, size_target): """This code is based on https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py Calculate the transformation matrix under the constraint of unbiased. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). Args: theta (float): Rotation angle in degrees. size_input (np.ndarray): Size of input image [w, h]. size_dst (np.ndarray): Size of output image [w, h]. size_target (np.ndarray): Size of ROI in input plane [w, h]. Returns: matrix (np.ndarray): A matrix for transformation. """ theta = np.deg2rad(theta) matrix = np.zeros((2, 3), dtype=np.float32) scale_x = size_dst[0] / size_target[0] scale_y = size_dst[1] / size_target[1] matrix[0, 0] = np.cos(theta) * scale_x matrix[0, 1] = -np.sin(theta) * scale_x matrix[0, 2] = scale_x * ( -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * np.sin(theta) + 0.5 * size_target[0]) matrix[1, 0] = np.sin(theta) * scale_y matrix[1, 1] = np.cos(theta) * scale_y matrix[1, 2] = scale_y * ( -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * np.cos(theta) + 0.5 * size_target[1]) return matrix def _get_3rd_point(a, b): """To calculate the affine matrix, three pairs of points are required. This function is used to get the 3rd point, given 2D points a & b. The 3rd point is defined by rotating vector `a - b` by 90 degrees anticlockwise, using b as the rotation center. Args: a (np.ndarray): point(x,y) b (np.ndarray): point(x,y) Returns: np.ndarray: The 3rd point. """ assert len( a) == 2, 'input of _get_3rd_point should be point with length of 2' assert len( b) == 2, 'input of _get_3rd_point should be point with length of 2' direction = a - b third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) return third_pt def rotate_point(pt, angle_rad): """Rotate a point by an angle. Args: pt (list[float]): 2 dimensional point to be rotated angle_rad (float): rotation angle by radian Returns: list[float]: Rotated point. 
""" assert len(pt) == 2 sn, cs = np.sin(angle_rad), np.cos(angle_rad) new_x = pt[0] * cs - pt[1] * sn new_y = pt[0] * sn + pt[1] * cs rotated_pt = [new_x, new_y] return rotated_pt def transpred(kpts, h, w, s): trans, _ = get_affine_mat_kernel(h, w, s, inv=True) return warp_affine_joints(kpts[..., :2].copy(), trans) def warp_affine_joints(joints, mat): """Apply affine transformation defined by the transform matrix on the joints. Args: joints (np.ndarray[..., 2]): Origin coordinate of joints. mat (np.ndarray[3, 2]): The affine matrix. Returns: matrix (np.ndarray[..., 2]): Result coordinate of joints. """ joints = np.array(joints) shape = joints.shape joints = joints.reshape(-1, 2) return np.dot(np.concatenate( (joints, joints[:, 0:1] * 0 + 1), axis=1), mat.T).reshape(shape) def affine_transform(pt, t): new_pt = np.array([pt[0], pt[1], 1.]).T new_pt = np.dot(t, new_pt) return new_pt[:2] def transform_preds(coords, center, scale, output_size): target_coords = np.zeros(coords.shape) trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) for p in range(coords.shape[0]): target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) return target_coords def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): if not isinstance(sigmas, np.ndarray): sigmas = np.array([ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ]) / 10.0 vars = (sigmas * 2)**2 xg = g[0::3] yg = g[1::3] vg = g[2::3] ious = np.zeros((d.shape[0])) for n_d in range(0, d.shape[0]): xd = d[n_d, 0::3] yd = d[n_d, 1::3] vd = d[n_d, 2::3] dx = xd - xg dy = yd - yg e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 if in_vis_thre is not None: ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) e = e[ind] ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 return ious def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): """greedily select boxes with high confidence and overlap with current maximum <= thresh rule out overlap >= thresh Args: kpts_db (list): The predicted keypoints within the image thresh (float): The threshold to select the boxes sigmas (np.array): The variance to calculate the oks iou Default: None in_vis_thre (float): The threshold to select the high confidence boxes Default: None Return: keep (list): indexes to keep """ if len(kpts_db) == 0: return [] scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) kpts = np.array( [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre) inds = np.where(oks_ovr <= thresh)[0] order = order[inds + 1] return keep def rescore(overlap, scores, thresh, type='gaussian'): assert overlap.shape[0] == scores.shape[0] if type == 'linear': inds = np.where(overlap >= thresh)[0] scores[inds] = scores[inds] * (1 - overlap[inds]) else: scores = scores * np.exp(-overlap**2 / thresh) return scores def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): """greedily select boxes with high confidence and overlap with current maximum <= thresh rule out overlap >= thresh Args: kpts_db (list): The predicted keypoints within the image thresh (float): The threshold to select the boxes sigmas (np.array): The variance to calculate the oks iou Default: None in_vis_thre (float): The threshold to select the high 
            confidence boxes
            Default: None
    Return:
        keep (list): indexes to keep
    """
    if len(kpts_db) == 0:
        return []

    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
    kpts = np.array(
        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])

    order = scores.argsort()[::-1]
    scores = scores[order]

    # max_dets = order.size
    max_dets = 20
    keep = np.zeros(max_dets, dtype=np.intp)
    keep_cnt = 0
    while order.size > 0 and keep_cnt < max_dets:
        i = order[0]
        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
                          sigmas, in_vis_thre)
        order = order[1:]
        scores = rescore(oks_ovr, scores[1:], thresh)

        tmp = scores.argsort()[::-1]
        order = order[tmp]
        scores = scores[tmp]

        keep[keep_cnt] = i
        keep_cnt += 1
    keep = keep[:keep_cnt]
    return keep


def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    if warning:
        if size is not None and align_corners:
            input_h, input_w = tuple(int(x) for x in input.shape[2:])
            output_h, output_w = tuple(int(x) for x in size)
            if output_h > input_h or output_w > input_w:
                if ((output_h > 1 and output_w > 1 and input_h > 1 and
                     input_w > 1) and (output_h - 1) % (input_h - 1) and
                        (output_w - 1) % (input_w - 1)):
                    warnings.warn(
                        f'When align_corners={align_corners}, '
                        'the output would be more aligned if '
                        f'input size {(input_h, input_w)} is `x+1` and '
                        f'out size {(output_h, output_w)} is `nx+1`')
    return F.interpolate(input, size, scale_factor, mode, align_corners)


def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
    """Flip the flipped heatmaps back to the original form.
    Note:
        - batch_size: N
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W
    Args:
        output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
            from the flipped images.
        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
            (for example, left ear -- right ear).
        target_type (str): GaussianHeatmap or CombinedTarget
    Returns:
        np.ndarray: heatmaps that flipped back to the original image
    """
    assert len(output_flipped.shape) == 4, \
        'output_flipped should be [batch_size, num_keypoints, height, width]'
    shape_ori = output_flipped.shape
    channels = 1
    if target_type.lower() == 'CombinedTarget'.lower():
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape((shape_ori[0], -1, channels,
                                             shape_ori[2], shape_ori[3]))
    output_flipped_back = output_flipped.clone()

    # Swap left-right parts
    for left, right in flip_pairs:
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape(shape_ori)
    # Flip horizontally
    output_flipped_back = output_flipped_back[..., ::-1]
    return output_flipped_back


def _calc_distances(preds, targets, mask, normalize):
    """Calculate the normalized distances between preds and target.
    Note:
        batch_size: N
        num_keypoints: K
        dimension of keypoints: D (normally, D=2 or D=3)
    Args:
        preds (np.ndarray[N, K, D]): Predicted keypoint location.
        targets (np.ndarray[N, K, D]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        normalize (np.ndarray[N, D]): Typical value is heatmap_size
    Returns:
        np.ndarray[K, N]: The normalized distances. \
            If target keypoints are missing, the distance is -1.
""" N, K, _ = preds.shape # set mask=0 when normalize==0 _mask = mask.copy() _mask[np.where((normalize == 0).sum(1))[0], :] = False distances = np.full((N, K), -1, dtype=np.float32) # handle invalid values normalize[np.where(normalize <= 0)] = 1e6 distances[_mask] = np.linalg.norm( ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) return distances.T def _distance_acc(distances, thr=0.5): """Return the percentage below the distance threshold, while ignoring distances values with -1. Note: batch_size: N Args: distances (np.ndarray[N, ]): The normalized distances. thr (float): Threshold of the distances. Returns: float: Percentage of distances below the threshold. \ If all target keypoints are missing, return -1. """ distance_valid = distances != -1 num_distance_valid = distance_valid.sum() if num_distance_valid > 0: return (distances[distance_valid] < thr).sum() / num_distance_valid return -1 def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): """Calculate the pose accuracy of PCK for each individual keypoint and the averaged accuracy across all keypoints for coordinates. Note: PCK metric measures accuracy of the localization of the body joints. The distances between predicted positions and the ground-truth ones are typically normalized by the bounding box size. The threshold (thr) of the normalized distance is commonly set as 0.05, 0.1 or 0.2 etc. - batch_size: N - num_keypoints: K Args: pred (np.ndarray[N, K, 2]): Predicted keypoint location. gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. mask (np.ndarray[N, K]): Visibility of the target. False for invisible joints, and True for visible. Invisible joints will be ignored for accuracy calculation. thr (float): Threshold of PCK calculation. normalize (np.ndarray[N, 2]): Normalization factor for H&W. Returns: tuple: A tuple containing keypoint accuracy. - acc (np.ndarray[K]): Accuracy of each keypoint. - avg_acc (float): Averaged accuracy across all keypoints. - cnt (int): Number of valid keypoints. """ distances = _calc_distances(pred, gt, mask, normalize) acc = np.array([_distance_acc(d, thr) for d in distances]) valid_acc = acc[acc >= 0] cnt = len(valid_acc) avg_acc = valid_acc.mean() if cnt > 0 else 0 return acc, avg_acc, cnt def keypoint_auc(pred, gt, mask, normalize, num_step=20): """Calculate the pose accuracy of PCK for each individual keypoint and the averaged accuracy across all keypoints for coordinates. Note: - batch_size: N - num_keypoints: K Args: pred (np.ndarray[N, K, 2]): Predicted keypoint location. gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. mask (np.ndarray[N, K]): Visibility of the target. False for invisible joints, and True for visible. Invisible joints will be ignored for accuracy calculation. normalize (float): Normalization factor. Returns: float: Area under curve. """ nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) x = [1.0 * i / num_step for i in range(num_step)] y = [] for thr in x: _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) y.append(avg_acc) auc = 0 for i in range(num_step): auc += 1.0 / num_step * y[i] return auc def keypoint_epe(pred, gt, mask): """Calculate the end-point error. Note: - batch_size: N - num_keypoints: K Args: pred (np.ndarray[N, K, 2]): Predicted keypoint location. gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. mask (np.ndarray[N, K]): Visibility of the target. False for invisible joints, and True for visible. Invisible joints will be ignored for accuracy calculation. 
Returns: float: Average end-point error. """ normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32) distances = _calc_distances(pred, gt, mask, normalize) distance_valid = distances[distances != -1] return distance_valid.sum() / max(1, len(distance_valid)) ================================================ FILE: ppdet/modeling/lane_utils.py ================================================ import os import cv2 import numpy as np from scipy.interpolate import InterpolatedUnivariateSpline class Lane: def __init__(self, points=None, invalid_value=-2., metadata=None): super(Lane, self).__init__() self.curr_iter = 0 self.points = points self.invalid_value = invalid_value self.function = InterpolatedUnivariateSpline( points[:, 1], points[:, 0], k=min(3, len(points) - 1)) self.min_y = points[:, 1].min() - 0.01 self.max_y = points[:, 1].max() + 0.01 self.metadata = metadata or {} def __repr__(self): return '[Lane]\n' + str(self.points) + '\n[/Lane]' def __call__(self, lane_ys): lane_xs = self.function(lane_ys) lane_xs[(lane_ys < self.min_y) | (lane_ys > self.max_y )] = self.invalid_value return lane_xs def to_array(self, sample_y_range, img_w, img_h): self.sample_y = range(sample_y_range[0], sample_y_range[1], sample_y_range[2]) sample_y = self.sample_y img_w, img_h = img_w, img_h ys = np.array(sample_y) / float(img_h) xs = self(ys) valid_mask = (xs >= 0) & (xs < 1) lane_xs = xs[valid_mask] * img_w lane_ys = ys[valid_mask] * img_h lane = np.concatenate( (lane_xs.reshape(-1, 1), lane_ys.reshape(-1, 1)), axis=1) return lane def __iter__(self): return self def __next__(self): if self.curr_iter < len(self.points): self.curr_iter += 1 return self.points[self.curr_iter - 1] self.curr_iter = 0 raise StopIteration COLORS = [ (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255), (128, 255, 0), (255, 128, 0), (128, 0, 255), (255, 0, 128), (0, 128, 255), (0, 255, 128), (128, 255, 255), (255, 128, 255), (255, 255, 128), (60, 180, 0), (180, 60, 0), (0, 60, 180), (0, 180, 60), (60, 0, 180), (180, 0, 60), (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255), (128, 255, 0), (255, 128, 0), (128, 0, 255), ] def imshow_lanes(img, lanes, show=False, out_file=None, width=4): lanes_xys = [] for _, lane in enumerate(lanes): xys = [] for x, y in lane: if x <= 0 or y <= 0: continue x, y = int(x), int(y) xys.append((x, y)) lanes_xys.append(xys) lanes_xys.sort(key=lambda xys: xys[0][0] if len(xys) > 0 else 0) for idx, xys in enumerate(lanes_xys): for i in range(1, len(xys)): cv2.line(img, xys[i - 1], xys[i], COLORS[idx], thickness=width) if show: cv2.imshow('view', img) cv2.waitKey(0) if out_file: if not os.path.exists(os.path.dirname(out_file)): os.makedirs(os.path.dirname(out_file)) cv2.imwrite(out_file, img) ================================================ FILE: ppdet/modeling/layers.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
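# --- Usage sketch for lane_utils.Lane defined above (not original code) ---
# Lane fits an interpolating spline x = f(y) over points normalized to [0, 1]
# and can be resampled at fixed pixel rows, e.g. for a 1280x720 image:
#
#   import numpy as np
#   pts = np.array([[0.40, 0.55], [0.42, 0.70], [0.45, 0.85]])  # (x, y)
#   lane = Lane(points=pts)
#   xy = lane.to_array(sample_y_range=(620, 390, -10), img_w=1280, img_h=720)
#   # xy holds (x, y) pixel coordinates for rows where the spline stays in [0, 1)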
import math import six import numpy as np from numbers import Integral import paddle import paddle.nn as nn from paddle import ParamAttr from paddle import to_tensor import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant, XavierUniform from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ppdet.modeling.bbox_utils import delta2bbox from . import ops from .initializer import xavier_uniform_, constant_ from paddle.vision.ops import DeformConv2D def _to_list(l): if isinstance(l, (list, tuple)): return list(l) return [l] class AlignConv(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): super(AlignConv, self).__init__() self.kernel_size = kernel_size self.align_conv = paddle.vision.ops.DeformConv2D( in_channels, out_channels, kernel_size=self.kernel_size, padding=(self.kernel_size - 1) // 2, groups=groups, weight_attr=ParamAttr(initializer=Normal(0, 0.01)), bias_attr=None) @paddle.no_grad() def get_offset(self, anchors, featmap_size, stride): """ Args: anchors: [B, L, 5] xc,yc,w,h,angle featmap_size: (feat_h, feat_w) stride: 8 Returns: """ batch = anchors.shape[0] dtype = anchors.dtype feat_h, feat_w = featmap_size pad = (self.kernel_size - 1) // 2 idx = paddle.arange(-pad, pad + 1, dtype=dtype) yy, xx = paddle.meshgrid(idx, idx) xx = paddle.reshape(xx, [-1]) yy = paddle.reshape(yy, [-1]) # get sampling locations of default conv xc = paddle.arange(0, feat_w, dtype=dtype) yc = paddle.arange(0, feat_h, dtype=dtype) yc, xc = paddle.meshgrid(yc, xc) xc = paddle.reshape(xc, [-1, 1]) yc = paddle.reshape(yc, [-1, 1]) x_conv = xc + xx y_conv = yc + yy # get sampling locations of anchors x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1) x_ctr = x_ctr / stride y_ctr = y_ctr / stride w_s = w / stride h_s = h / stride cos, sin = paddle.cos(a), paddle.sin(a) dw, dh = w_s / self.kernel_size, h_s / self.kernel_size x, y = dw * xx, dh * yy xr = cos * x - sin * y yr = sin * x + cos * y x_anchor, y_anchor = xr + x_ctr, yr + y_ctr # get offset filed offset_x = x_anchor - x_conv offset_y = y_anchor - y_conv offset = paddle.stack([offset_y, offset_x], axis=-1) offset = offset.reshape( [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2]) offset = offset.transpose([0, 3, 1, 2]) return offset def forward(self, x, refine_anchors, featmap_size, stride): batch = x.shape[0].numpy() offset = self.get_offset(refine_anchors, featmap_size, stride) if self.training: x = F.relu(self.align_conv(x, offset.detach())) else: x = F.relu(self.align_conv(x, offset)) return x class DeformableConvV2(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, weight_attr=None, bias_attr=None, lr_scale=1, regularizer=None, skip_quant=False, dcn_bias_regularizer=L2Decay(0.), dcn_bias_lr_scale=2.): super(DeformableConvV2, self).__init__() self.offset_channel = 2 * kernel_size**2 self.mask_channel = kernel_size**2 if lr_scale == 1 and regularizer is None: offset_bias_attr = ParamAttr(initializer=Constant(0.)) else: offset_bias_attr = ParamAttr( initializer=Constant(0.), learning_rate=lr_scale, regularizer=regularizer) self.conv_offset = nn.Conv2D( in_channels, 3 * kernel_size**2, kernel_size, stride=stride, padding=(kernel_size - 1) // 2, weight_attr=ParamAttr(initializer=Constant(0.0)), bias_attr=offset_bias_attr) if skip_quant: self.conv_offset.skip_quant = True if bias_attr: # in FCOS-DCN head, specifically need learning_rate and regularizer dcn_bias_attr = 
ParamAttr( initializer=Constant(value=0), regularizer=dcn_bias_regularizer, learning_rate=dcn_bias_lr_scale) else: # in ResNet backbone, do not need bias dcn_bias_attr = False self.conv_dcn = DeformConv2D( in_channels, out_channels, kernel_size, stride=stride, padding=(kernel_size - 1) // 2 * dilation, dilation=dilation, groups=groups, weight_attr=weight_attr, bias_attr=dcn_bias_attr) def forward(self, x): offset_mask = self.conv_offset(x) offset, mask = paddle.split( offset_mask, num_or_sections=[self.offset_channel, self.mask_channel], axis=1) mask = F.sigmoid(mask) y = self.conv_dcn(x, offset, mask=mask) return y class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride, groups=1, norm_type='bn', norm_decay=0., norm_groups=32, use_dcn=False, bias_on=False, lr_scale=1., freeze_norm=False, initializer=Normal( mean=0., std=0.01), skip_quant=False, dcn_lr_scale=2., dcn_regularizer=L2Decay(0.)): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn', None] if bias_on: bias_attr = ParamAttr( initializer=Constant(value=0.), learning_rate=lr_scale) else: bias_attr = False if not use_dcn: self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr( initializer=initializer, learning_rate=1.), bias_attr=bias_attr) if skip_quant: self.conv.skip_quant = True else: # in FCOS-DCN head, specifically need learning_rate and regularizer self.conv = DeformableConvV2( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr( initializer=initializer, learning_rate=1.), bias_attr=True, lr_scale=dcn_lr_scale, regularizer=dcn_regularizer, dcn_bias_regularizer=dcn_regularizer, dcn_bias_lr_scale=dcn_lr_scale, skip_quant=skip_quant) norm_lr = 0. if freeze_norm else 1. 
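# freeze_norm stops training of the following norm layer's affine parameters
# by zeroing their learning rate (for BN the running statistics still update
# in train mode); norm_decay sets the L2 regularization applied to them.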
param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay) if norm_decay is not None else None) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay) if norm_decay is not None else None) if norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=param_attr, bias_attr=bias_attr) elif norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=norm_groups, num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) else: self.norm = None def forward(self, inputs): out = self.conv(inputs) if self.norm is not None: out = self.norm(out) return out class LiteConv(nn.Layer): def __init__(self, in_channels, out_channels, stride=1, with_act=True, norm_type='sync_bn', name=None): super(LiteConv, self).__init__() self.lite_conv = nn.Sequential() conv1 = ConvNormLayer( in_channels, in_channels, filter_size=5, stride=stride, groups=in_channels, norm_type=norm_type, initializer=XavierUniform()) conv2 = ConvNormLayer( in_channels, out_channels, filter_size=1, stride=stride, norm_type=norm_type, initializer=XavierUniform()) conv3 = ConvNormLayer( out_channels, out_channels, filter_size=1, stride=stride, norm_type=norm_type, initializer=XavierUniform()) conv4 = ConvNormLayer( out_channels, out_channels, filter_size=5, stride=stride, groups=out_channels, norm_type=norm_type, initializer=XavierUniform()) conv_list = [conv1, conv2, conv3, conv4] self.lite_conv.add_sublayer('conv1', conv1) self.lite_conv.add_sublayer('relu6_1', nn.ReLU6()) self.lite_conv.add_sublayer('conv2', conv2) if with_act: self.lite_conv.add_sublayer('relu6_2', nn.ReLU6()) self.lite_conv.add_sublayer('conv3', conv3) self.lite_conv.add_sublayer('relu6_3', nn.ReLU6()) self.lite_conv.add_sublayer('conv4', conv4) if with_act: self.lite_conv.add_sublayer('relu6_4', nn.ReLU6()) def forward(self, inputs): out = self.lite_conv(inputs) return out class DropBlock(nn.Layer): def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'): """ DropBlock layer, see https://arxiv.org/abs/1810.12890 Args: block_size (int): block size keep_prob (int): keep probability name (str): layer name data_format (str): data format, NCHW or NHWC """ super(DropBlock, self).__init__() self.block_size = block_size self.keep_prob = keep_prob self.name = name self.data_format = data_format def forward(self, x): if not self.training or self.keep_prob == 1: return x else: gamma = (1. - self.keep_prob) / (self.block_size**2) if self.data_format == 'NCHW': shape = x.shape[2:] else: shape = x.shape[1:3] for s in shape: gamma *= s / (s - self.block_size + 1) matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype) mask_inv = F.max_pool2d( matrix, self.block_size, stride=1, padding=self.block_size // 2, data_format=self.data_format) mask = 1. 
- mask_inv mask = mask.astype('float32') x = x.astype('float32') y = x * mask * (mask.numel() / mask.sum()) return y @register @serializable class AnchorGeneratorSSD(object): def __init__(self, steps=[8, 16, 32, 64, 100, 300], aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]], min_ratio=15, max_ratio=90, base_size=300, min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0], max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0], offset=0.5, flip=True, clip=False, min_max_aspect_ratios_order=False): self.steps = steps self.aspect_ratios = aspect_ratios self.min_ratio = min_ratio self.max_ratio = max_ratio self.base_size = base_size self.min_sizes = min_sizes self.max_sizes = max_sizes self.offset = offset self.flip = flip self.clip = clip self.min_max_aspect_ratios_order = min_max_aspect_ratios_order if self.min_sizes == [] and self.max_sizes == []: num_layer = len(aspect_ratios) step = int( math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2 ))) for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1, step): self.min_sizes.append(self.base_size * ratio / 100.) self.max_sizes.append(self.base_size * (ratio + step) / 100.) self.min_sizes = [self.base_size * .10] + self.min_sizes self.max_sizes = [self.base_size * .20] + self.max_sizes self.num_priors = [] for aspect_ratio, min_size, max_size in zip( aspect_ratios, self.min_sizes, self.max_sizes): if isinstance(min_size, (list, tuple)): self.num_priors.append( len(_to_list(min_size)) + len(_to_list(max_size))) else: self.num_priors.append((len(aspect_ratio) * 2 + 1) * len( _to_list(min_size)) + len(_to_list(max_size))) def __call__(self, inputs, image): boxes = [] for input, min_size, max_size, aspect_ratio, step in zip( inputs, self.min_sizes, self.max_sizes, self.aspect_ratios, self.steps): box, _ = ops.prior_box( input=input, image=image, min_sizes=_to_list(min_size), max_sizes=_to_list(max_size), aspect_ratios=aspect_ratio, flip=self.flip, clip=self.clip, steps=[step, step], offset=self.offset, min_max_aspect_ratios_order=self.min_max_aspect_ratios_order) boxes.append(paddle.reshape(box, [-1, 4])) return boxes @register @serializable class RCNNBox(object): __shared__ = ['num_classes', 'export_onnx'] def __init__(self, prior_box_var=[10., 10., 5., 5.], code_type="decode_center_size", box_normalized=False, num_classes=80, export_onnx=False): super(RCNNBox, self).__init__() self.prior_box_var = prior_box_var self.code_type = code_type self.box_normalized = box_normalized self.num_classes = num_classes self.export_onnx = export_onnx def __call__(self, bbox_head_out, rois, im_shape, scale_factor): bbox_pred = bbox_head_out[0] cls_prob = bbox_head_out[1] roi = rois[0] rois_num = rois[1] if self.export_onnx: onnx_rois_num_per_im = rois_num[0] origin_shape = paddle.expand(im_shape[0, :], [onnx_rois_num_per_im, 2]) else: origin_shape_list = [] if isinstance(roi, list): batch_size = len(roi) else: batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) # bbox_pred.shape: [N, C*4] for idx in range(batch_size): rois_num_per_im = rois_num[idx] expand_im_shape = paddle.expand(im_shape[idx, :], [rois_num_per_im, 2]) origin_shape_list.append(expand_im_shape) origin_shape = paddle.concat(origin_shape_list) # bbox_pred.shape: [N, C*4] # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head) bbox = paddle.concat(roi) bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) scores = cls_prob[:, :-1] # bbox.shape: [N, C, 4] # bbox.shape[1] must be equal to scores.shape[1] total_num = bbox.shape[0] 
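# Broadcast the decoded boxes so there is one box per class (a no-op when the
# head already predicts class-specific boxes), then clamp each coordinate to
# the original image rectangle below.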
bbox_dim = bbox.shape[-1] bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim]) origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) zeros = paddle.zeros_like(origin_h) x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) bbox = paddle.stack([x1, y1, x2, y2], axis=-1) bboxes = (bbox, rois_num) return bboxes, scores @register @serializable class MultiClassNMS(object): def __init__(self, score_threshold=.05, nms_top_k=-1, keep_top_k=100, nms_threshold=.5, normalized=True, nms_eta=1.0, return_index=False, return_rois_num=True, trt=False, cpu=False): super(MultiClassNMS, self).__init__() self.score_threshold = score_threshold self.nms_top_k = nms_top_k self.keep_top_k = keep_top_k self.nms_threshold = nms_threshold self.normalized = normalized self.nms_eta = nms_eta self.return_index = return_index self.return_rois_num = return_rois_num self.trt = trt self.cpu = cpu def __call__(self, bboxes, score, background_label=-1): """ bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape [N, M, 4], N is the batch size and M is the number of bboxes 2. (List[Tensor]) bboxes and bbox_num, bboxes have shape of [M, C, 4], C is the class number and bbox_num means the number of bboxes of each batch with shape [N,] score (Tensor): Predicted scores with shape [N, C, M] or [M, C] background_label (int): Ignore the background label; For example, RCNN is num_classes and YOLO is -1. """ kwargs = self.__dict__.copy() if isinstance(bboxes, tuple): bboxes, bbox_num = bboxes kwargs.update({'rois_num': bbox_num}) if background_label > -1: kwargs.update({'background_label': background_label}) kwargs.pop('trt') kwargs.pop('cpu') # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt if self.trt and (int(paddle.version.major) == 0 or (int(paddle.version.major) >= 2 and int(paddle.version.minor) >= 3)): # TODO(wangxinxin08): tricky switch to run nms on tensorrt kwargs.update({'nms_eta': 1.1}) bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) bbox = bbox.reshape([1, -1, 6]) idx = paddle.nonzero(bbox[..., 0] != -1) bbox = paddle.gather_nd(bbox, idx) return bbox, bbox_num, None else: if self.cpu: device = paddle.device.get_device() paddle.set_device('cpu') outputs = ops.multiclass_nms(bboxes, score, **kwargs) paddle.set_device(device) return outputs else: return ops.multiclass_nms(bboxes, score, **kwargs) @register @serializable class MatrixNMS(object): __append_doc__ = True def __init__(self, score_threshold=.05, post_threshold=.05, nms_top_k=-1, keep_top_k=100, use_gaussian=False, gaussian_sigma=2., normalized=False, background_label=0): super(MatrixNMS, self).__init__() self.score_threshold = score_threshold self.post_threshold = post_threshold self.nms_top_k = nms_top_k self.keep_top_k = keep_top_k self.normalized = normalized self.use_gaussian = use_gaussian self.gaussian_sigma = gaussian_sigma self.background_label = background_label def __call__(self, bbox, score, *args): return ops.matrix_nms( bboxes=bbox, scores=score, score_threshold=self.score_threshold, post_threshold=self.post_threshold, nms_top_k=self.nms_top_k, keep_top_k=self.keep_top_k, use_gaussian=self.use_gaussian, gaussian_sigma=self.gaussian_sigma, background_label=self.background_label, 
normalized=self.normalized) @register @serializable class YOLOBox(object): __shared__ = ['num_classes'] def __init__(self, num_classes=80, conf_thresh=0.005, downsample_ratio=32, clip_bbox=True, scale_x_y=1.): self.num_classes = num_classes self.conf_thresh = conf_thresh self.downsample_ratio = downsample_ratio self.clip_bbox = clip_bbox self.scale_x_y = scale_x_y def __call__(self, yolo_head_out, anchors, im_shape, scale_factor, var_weight=None): boxes_list = [] scores_list = [] origin_shape = im_shape / scale_factor origin_shape = paddle.cast(origin_shape, 'int32') for i, head_out in enumerate(yolo_head_out): boxes, scores = paddle.vision.ops.yolo_box( head_out, origin_shape, anchors[i], self.num_classes, self.conf_thresh, self.downsample_ratio // 2**i, self.clip_bbox, scale_x_y=self.scale_x_y) boxes_list.append(boxes) scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) yolo_boxes = paddle.concat(boxes_list, axis=1) yolo_scores = paddle.concat(scores_list, axis=2) return yolo_boxes, yolo_scores @register @serializable class SSDBox(object): def __init__(self, is_normalized=True, prior_box_var=[0.1, 0.1, 0.2, 0.2], use_fuse_decode=False): self.is_normalized = is_normalized self.norm_delta = float(not self.is_normalized) self.prior_box_var = prior_box_var self.use_fuse_decode = use_fuse_decode def __call__(self, preds, prior_boxes, im_shape, scale_factor, var_weight=None): boxes, scores = preds boxes = paddle.concat(boxes, axis=1) prior_boxes = paddle.concat(prior_boxes) if self.use_fuse_decode: output_boxes = ops.box_coder( prior_boxes, self.prior_box_var, boxes, code_type="decode_center_size", box_normalized=self.is_normalized) else: pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta pb_x = prior_boxes[:, 0] + pb_w * 0.5 pb_y = prior_boxes[:, 1] + pb_h * 0.5 out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0] out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1] out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h output_boxes = paddle.stack( [ out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2., out_y + out_h / 2. ], axis=-1) if self.is_normalized: h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1) w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1) im_shape = paddle.stack([w, h, w, h], axis=-1) output_boxes *= im_shape else: output_boxes[..., -2:] -= 1.0 output_scores = F.softmax(paddle.concat( scores, axis=1)).transpose([0, 2, 1]) return output_boxes, output_scores @register class TTFBox(object): __shared__ = ['down_ratio'] def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4): super(TTFBox, self).__init__() self.max_per_img = max_per_img self.score_thresh = score_thresh self.down_ratio = down_ratio def _simple_nms(self, heat, kernel=3): """ Use maxpool to filter the max score, get local peaks. """ pad = (kernel - 1) // 2 hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) keep = paddle.cast(hmax == heat, 'float32') return heat * keep def _topk(self, scores): """ Select top k scores and decode to get xy coordinates. 
""" k = self.max_per_img shape_fm = paddle.shape(scores) shape_fm.stop_gradient = True cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] # batch size is 1 scores_r = paddle.reshape(scores, [cat, -1]) topk_scores, topk_inds = paddle.topk(scores_r, k) topk_ys = topk_inds // width topk_xs = topk_inds % width topk_score_r = paddle.reshape(topk_scores, [-1]) topk_score, topk_ind = paddle.topk(topk_score_r, k) k_t = paddle.full(topk_ind.shape, k, dtype='int64') topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') topk_inds = paddle.reshape(topk_inds, [-1]) topk_ys = paddle.reshape(topk_ys, [-1, 1]) topk_xs = paddle.reshape(topk_xs, [-1, 1]) topk_inds = paddle.gather(topk_inds, topk_ind) topk_ys = paddle.gather(topk_ys, topk_ind) topk_xs = paddle.gather(topk_xs, topk_ind) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def _decode(self, hm, wh, im_shape, scale_factor): heatmap = F.sigmoid(hm) heat = self._simple_nms(heatmap) scores, inds, clses, ys, xs = self._topk(heat) ys = paddle.cast(ys, 'float32') * self.down_ratio xs = paddle.cast(xs, 'float32') * self.down_ratio scores = paddle.tensor.unsqueeze(scores, [1]) clses = paddle.tensor.unsqueeze(clses, [1]) wh_t = paddle.transpose(wh, [0, 2, 3, 1]) wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) wh = paddle.gather(wh, inds) x1 = xs - wh[:, 0:1] y1 = ys - wh[:, 1:2] x2 = xs + wh[:, 2:3] y2 = ys + wh[:, 3:4] bboxes = paddle.concat([x1, y1, x2, y2], axis=1) scale_y = scale_factor[:, 0:1] scale_x = scale_factor[:, 1:2] scale_expand = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=1) boxes_shape = paddle.shape(bboxes) boxes_shape.stop_gradient = True scale_expand = paddle.expand(scale_expand, shape=boxes_shape) bboxes = paddle.divide(bboxes, scale_expand) results = paddle.concat([clses, scores, bboxes], axis=1) # hack: append result with cls=-1 and score=1. to avoid all scores # are less than score_thresh which may cause error in gather. 
fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]])) fill_r = paddle.cast(fill_r, results.dtype) results = paddle.concat([results, fill_r]) scores = results[:, 1] valid_ind = paddle.nonzero(scores > self.score_thresh) results = paddle.gather(results, valid_ind) return results, results.shape[0:1] def __call__(self, hm, wh, im_shape, scale_factor): results = [] results_num = [] for i in range(scale_factor.shape[0]): result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ], im_shape[i:i + 1, ], scale_factor[i:i + 1, ]) results.append(result) results_num.append(num) results = paddle.concat(results, axis=0) results_num = paddle.concat(results_num, axis=0) return results, results_num @register @serializable class JDEBox(object): __shared__ = ['num_classes'] def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32): self.num_classes = num_classes self.conf_thresh = conf_thresh self.downsample_ratio = downsample_ratio def generate_anchor(self, nGh, nGw, anchor_wh): nA = len(anchor_wh) yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)]) mesh = paddle.stack( (xv, yv), axis=0).cast(dtype='float32') # 2 x nGh x nGw meshs = paddle.tile(mesh, [nA, 1, 1, 1]) anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat( int(nGh), axis=-2).repeat( int(nGw), axis=-1) anchor_offset_mesh = paddle.to_tensor( anchor_offset_mesh.astype(np.float32)) # nA x 2 x nGh x nGw anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1) anchor_mesh = paddle.transpose(anchor_mesh, [0, 2, 3, 1]) # (nA x nGh x nGw) x 4 return anchor_mesh def decode_delta(self, delta, fg_anchor_list): px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ fg_anchor_list[:, 2], fg_anchor_list[:,3] dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] gx = pw * dx + px gy = ph * dy + py gw = pw * paddle.exp(dw) gh = ph * paddle.exp(dh) gx1 = gx - gw * 0.5 gy1 = gy - gh * 0.5 gx2 = gx + gw * 0.5 gy2 = gy + gh * 0.5 return paddle.stack([gx1, gy1, gx2, gy2], axis=1) def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec): anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec) anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) pred_list = self.decode_delta( paddle.reshape( delta_map, shape=[-1, 4]), paddle.reshape( anchor_mesh, shape=[-1, 4])) pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4]) return pred_map def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec): boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw] nGh, nGw = boxes_shape[-2], boxes_shape[-1] nB = 1 # TODO: only support bs=1 now boxes_list, scores_list = [], [] for idx in range(nB): p = paddle.reshape( head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw]) p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6] delta_map = p[:, :, :, :4] boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec) # [nA * nGh * nGw, 4] boxes_list.append(boxes * stride) p_conf = paddle.transpose( p[:, :, :, 4:6], perm=[3, 0, 1, 2]) # [2, nA, nGh, nGw] p_conf = F.softmax( p_conf, axis=0)[1, :, :, :].unsqueeze(-1) # [nA, nGh, nGw, 1] scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1]) scores_list.append(scores) boxes_results = paddle.stack(boxes_list) scores_results = paddle.stack(scores_list) return boxes_results, scores_results def __call__(self, yolo_head_out, anchors): bbox_pred_list = [] for i, head_out in enumerate(yolo_head_out): stride = self.downsample_ratio // 2**i anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride nA = 
len(anc_w)
            boxes, scores = self._postprocessing_by_level(nA, stride,
                                                          head_out, anchor_vec)
            bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1))
        yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1)
        boxes_idx_over_conf_thr = paddle.nonzero(
            yolo_boxes_scores[:, :, -1] > self.conf_thresh)
        boxes_idx_over_conf_thr.stop_gradient = True
        return boxes_idx_over_conf_thr, yolo_boxes_scores


@register
@serializable
class MaskMatrixNMS(object):
    """
    Matrix NMS for multi-class masks.
    Args:
        update_threshold (float): Updated threshold of category score in the
            second round.
        pre_nms_top_n (int): Number of total instances to be kept per image
            before NMS.
        post_nms_top_n (int): Number of total instances to be kept per image
            after NMS.
        kernel (str): 'linear' or 'gaussian'.
        sigma (float): std in gaussian method.
    Input:
        seg_preds (Variable): shape (n, h, w), segmentation feature maps
        seg_masks (Variable): shape (n, h, w), segmentation feature maps
        cate_labels (Variable): shape (n), mask labels in descending order
        cate_scores (Variable): shape (n), mask scores in descending order
        sum_masks (Variable): a float tensor of the sum of seg_masks
    Returns:
        Variable: cate_scores, tensors of shape (n)
    """

    def __init__(self,
                 update_threshold=0.05,
                 pre_nms_top_n=500,
                 post_nms_top_n=100,
                 kernel='gaussian',
                 sigma=2.0):
        super(MaskMatrixNMS, self).__init__()
        self.update_threshold = update_threshold
        self.pre_nms_top_n = pre_nms_top_n
        self.post_nms_top_n = post_nms_top_n
        self.kernel = kernel
        self.sigma = sigma

    def _sort_score(self, scores, top_num):
        if scores.shape[0] > top_num:
            return paddle.topk(scores, top_num)[1]
        else:
            return paddle.argsort(scores, descending=True)

    def __call__(self,
                 seg_preds,
                 seg_masks,
                 cate_labels,
                 cate_scores,
                 sum_masks=None):
        # sort and keep top nms_pre
        sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
        seg_masks = paddle.gather(seg_masks, index=sort_inds)
        seg_preds = paddle.gather(seg_preds, index=sort_inds)
        sum_masks = paddle.gather(sum_masks, index=sort_inds)
        cate_scores = paddle.gather(cate_scores, index=sort_inds)
        cate_labels = paddle.gather(cate_labels, index=sort_inds)

        seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
        # inter.
        inter_matrix = paddle.mm(seg_masks,
                                 paddle.transpose(seg_masks, [1, 0]))
        n_samples = cate_labels.shape
        n_samples = paddle.to_tensor(n_samples, dtype="int32")
        # union.
        sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
        # iou.
        iou_matrix = (inter_matrix / (
            sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))
        iou_matrix = paddle.triu(iou_matrix, diagonal=1)
        # label_specific matrix.
        cate_labels_x = paddle.expand(cate_labels,
                                      shape=[n_samples, n_samples])
        label_matrix = paddle.cast(
            (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
            'float32')
        label_matrix = paddle.triu(label_matrix, diagonal=1)

        # IoU compensation
        compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
        compensate_iou = paddle.expand(
            compensate_iou, shape=[n_samples, n_samples])
        compensate_iou = paddle.transpose(compensate_iou, [1, 0])

        # IoU decay
        decay_iou = iou_matrix * label_matrix

        # matrix nms
        if self.kernel == 'gaussian':
            decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
            compensate_matrix = paddle.exp(-1 * self.sigma *
                                           (compensate_iou**2))
            decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
                                           axis=0)
        elif self.kernel == 'linear':
            decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
            decay_coefficient = paddle.min(decay_matrix, axis=0)
        else:
            raise NotImplementedError

        # update the score.
cate_scores = cate_scores * decay_coefficient
        y = paddle.zeros(shape=cate_scores.shape, dtype='float32')
        keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,
                            y)
        keep = paddle.nonzero(keep)
        keep = paddle.squeeze(keep, axis=[1])
        # Prevent empty and increase fake data
        keep = paddle.concat(
            [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')])

        seg_preds = paddle.gather(seg_preds, index=keep)
        cate_scores = paddle.gather(cate_scores, index=keep)
        cate_labels = paddle.gather(cate_labels, index=keep)

        # sort and keep top_k
        sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
        seg_preds = paddle.gather(seg_preds, index=sort_inds)
        cate_scores = paddle.gather(cate_scores, index=sort_inds)
        cate_labels = paddle.gather(cate_labels, index=sort_inds)
        return seg_preds, cate_scores, cate_labels


def Conv2d(in_channels,
           out_channels,
           kernel_size,
           stride=1,
           padding=0,
           dilation=1,
           groups=1,
           bias=True,
           weight_init=Normal(std=0.001),
           bias_init=Constant(0.)):
    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
    if bias:
        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
    else:
        bias_attr = False
    conv = nn.Conv2D(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        groups,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return conv


def ConvTranspose2d(in_channels,
                    out_channels,
                    kernel_size,
                    stride=1,
                    padding=0,
                    output_padding=0,
                    groups=1,
                    bias=True,
                    dilation=1,
                    weight_init=Normal(std=0.001),
                    bias_init=Constant(0.)):
    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
    if bias:
        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
    else:
        bias_attr = False
    conv = nn.Conv2DTranspose(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        dilation,
        groups,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return conv


def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True):
    if not affine:
        weight_attr = False
        bias_attr = False
    else:
        weight_attr = None
        bias_attr = None
    batchnorm = nn.BatchNorm2D(
        num_features,
        momentum,
        eps,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return batchnorm


def ReLU():
    return nn.ReLU()


def Upsample(scale_factor=None, mode='nearest', align_corners=False):
    return nn.Upsample(None, scale_factor, mode, align_corners)


def MaxPool(kernel_size, stride, padding, ceil_mode=False):
    return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode)


class Concat(nn.Layer):
    def __init__(self, dim=0):
        super(Concat, self).__init__()
        self.dim = dim

    def forward(self, inputs):
        return paddle.concat(inputs, axis=self.dim)

    def extra_repr(self):
        return 'dim={}'.format(self.dim)


def _convert_attention_mask(attn_mask, dtype):
    """
    Convert the attention mask to the target dtype we expect.
    Parameters:
        attn_mask (Tensor, optional): A tensor used in multi-head attention
            to prevent attention to some unwanted positions, usually the
            paddings or the subsequent positions. It is a tensor with shape
            broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
            When the data type is bool, the unwanted positions have `False`
            values and the others have `True` values. When the data type is
            int, the unwanted positions have 0 values and the others have 1
            values. When the data type is float, the unwanted positions have
            `-INF` values and the others have 0 values. It can be None when
            nothing needs to be prevented from being attended to. Default None.
        dtype (VarType): The target type of `attn_mask` we expect.
    Returns:
        Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.
    """
    return nn.layer.transformer._convert_attention_mask(attn_mask, dtype)


@register
class MultiHeadAttention(nn.Layer):
    """
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple attention computations in parallel
    to jointly attend to information from different representation subspaces.

    Please refer to `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_
    for more details.

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention.
        dropout (float, optional): The dropout probability used on attention
            weights to drop some attention targets. 0 for no dropout. Default 0
        kdim (int, optional): The feature size in key. If None, assumed equal to
            `embed_dim`. Default None.
        vdim (int, optional): The feature size in value. If None, assumed equal to
            `embed_dim`. Default None.
        need_weights (bool, optional): Indicate whether to return the attention
            weights. Default False.

    Examples:
        .. code-block:: python
            import paddle
            # encoder input: [batch_size, sequence_length, d_model]
            query = paddle.rand((2, 4, 128))
            # self attention mask: [batch_size, num_heads, query_len, query_len]
            attn_mask = paddle.rand((2, 2, 4, 4))
            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            self.in_proj_weight = self.create_parameter(
                shape=[embed_dim, 3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=False)
            self.in_proj_bias = self.create_parameter(
                shape=[3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=True)
        else:
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(self.kdim, embed_dim)
            self.v_proj = nn.Linear(self.vdim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self._type_list = ('q_proj', 'k_proj', 'v_proj')

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                constant_(p)

    def compute_qkv(self, tensor, index):
        if self._qkv_same_embed_dim:
            tensor = F.linear(
                x=tensor,
                weight=self.in_proj_weight[:, index * self.embed_dim:(
                    index + 1) * self.embed_dim],
                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
                                       self.embed_dim]
                if self.in_proj_bias is not None else None)
        else:
            tensor = getattr(self, self._type_list[index])(tensor)
        tensor = tensor.reshape(
            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
        return tensor

    def forward(self, query, key=None, value=None, attn_mask=None):
        r"""
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.

        Parameters:
            query (Tensor): The queries for multi-head attention. It is a tensor
                with shape `[batch_size, query_length, embed_dim]`. The data type
                should be float32 or float64.
            key (Tensor, optional): The keys for multi-head attention. It is
                a tensor with shape `[batch_size, key_length, kdim]`. The data type
                should be float32 or float64. If None, use `query` as `key`.
                Default None.
            value (Tensor, optional): The values for multi-head attention. It
                is a tensor with shape `[batch_size, value_length, vdim]`.
                The data type should be float32 or float64. If None, use `query` as
                `value`. Default None.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevent attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                nothing needs to be prevented from being attended to. Default None.

        Returns:
            Tensor|tuple: It is a tensor that has the same shape and data type \
                as `query`, representing attention output. Or a tuple if \
                `need_weights` is True or `cache` is not None. If `need_weights` \
                is True, except for attention output, the tuple also includes \
                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
                If `cache` is not None, the tuple then includes the new cache \
                having the same type as `cache`, and if it is `StaticCache`, it \
                is same as the input `cache`, if it is `Cache`, the new cache \
                reserves tensors concatenating raw tensors with intermediate \
                results of current query.
        """
        key = query if key is None else key
        value = query if value is None else value
        # compute q, k, v
        q, k, v = (self.compute_qkv(t, i)
                   for i, t in enumerate([query, key, value]))

        # scale dot product attention
        product = paddle.matmul(x=q, y=k, transpose_y=True)
        scaling = float(self.head_dim)**-0.5
        product = product * scaling

        if attn_mask is not None:
            # Support bool or int mask
            attn_mask = _convert_attention_mask(attn_mask, product.dtype)
            product = product + attn_mask
        weights = F.softmax(product)
        if self.dropout:
            weights = F.dropout(
                weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")
        out = paddle.matmul(weights, v)

        # combine heads
        out = paddle.transpose(out, perm=[0, 2, 1, 3])
        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        return out if len(outs) == 1 else tuple(outs)


@register
class ConvMixer(nn.Layer):
    def __init__(
            self,
            dim,
            depth,
            kernel_size=3, ):
        super().__init__()
        self.dim = dim
        self.depth = depth
        self.kernel_size = kernel_size

        self.mixer = self.conv_mixer(dim, depth, kernel_size)

    def forward(self, x):
        return self.mixer(x)

    @staticmethod
    def conv_mixer(
            dim,
            depth,
            kernel_size, ):
        Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim))
        Residual = type('Residual', (Seq, ),
                        {'forward': lambda self, x: self[0](x) + x})
        return Seq(*[
            Seq(Residual(
                ActBn(
                    nn.Conv2D(
                        dim, dim, kernel_size, groups=dim, padding="same"))),
                ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth)
        ])


================================================
FILE: ppdet/modeling/losses/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import yolo_loss from . import iou_aware_loss from . import iou_loss from . import ssd_loss from . import fcos_loss from . import solov2_loss from . import ctfocal_loss from . import keypoint_loss from . import jde_loss from . import fairmot_loss from . import gfocal_loss from . import detr_loss from . import sparsercnn_loss from . import focal_loss from . import smooth_l1_loss from . import probiou_loss from . import cot_loss from . import supcontrast from . import queryinst_loss from . import clrnet_loss from . import clrnet_line_iou_loss from .yolo_loss import * from .iou_aware_loss import * from .iou_loss import * from .ssd_loss import * from .fcos_loss import * from .solov2_loss import * from .ctfocal_loss import * from .keypoint_loss import * from .jde_loss import * from .fairmot_loss import * from .gfocal_loss import * from .detr_loss import * from .sparsercnn_loss import * from .focal_loss import * from .smooth_l1_loss import * from .pose3d_loss import * from .probiou_loss import * from .cot_loss import * from .supcontrast import * from .queryinst_loss import * from .clrnet_loss import * from .clrnet_line_iou_loss import * ================================================ FILE: ppdet/modeling/losses/clrnet_line_iou_loss.py ================================================ import paddle def line_iou(pred, target, img_w, length=15, aligned=True): ''' Calculate the line iou value between predictions and targets Args: pred: lane predictions, shape: (num_pred, 72) target: ground truth, shape: (num_target, 72) img_w: image width length: extended radius aligned: True for iou loss calculation, False for pair-wise ious in assign ''' px1 = pred - length px2 = pred + length tx1 = target - length tx2 = target + length if aligned: invalid_mask = target ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1) union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1) else: num_pred = pred.shape[0] invalid_mask = target.tile([num_pred, 1, 1]) ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum( px1[:, None, :], tx1[None, ...])) union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) - paddle.minimum(px1[:, None, :], tx1[None, ...])) invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w) ovr[invalid_masks] = 0. union[invalid_masks] = 0. 
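    # Each lane is a row-wise list of x-coordinates; every point is widened to
    # a horizontal segment [x - length, x + length], and the line IoU is the
    # ratio of summed per-row overlaps to summed per-row unions. E.g. with
    # length=15, two rows whose x's differ by 10 give ovr = 20 and union = 40,
    # i.e. a per-row IoU of 0.5.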
iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)
    return iou


def liou_loss(pred, target, img_w, length=15):
    return (1 - line_iou(pred, target, img_w, length)).mean()


================================================
FILE: ppdet/modeling/losses/clrnet_loss.py
================================================
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from ppdet.core.workspace import register
from ppdet.modeling.clrnet_utils import accuracy
from ppdet.modeling.assigners.clrnet_assigner import assign
from ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss

__all__ = ['CLRNetLoss']


class SoftmaxFocalLoss(nn.Layer):
    def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
        super(SoftmaxFocalLoss, self).__init__()
        self.gamma = gamma
        self.nll = nn.NLLLoss(ignore_index=ignore_lb)

    def forward(self, logits, labels):
        # paddle.nn.functional takes `axis`, not torch's `dim`
        scores = F.softmax(logits, axis=1)
        factor = paddle.pow(1. - scores, self.gamma)
        log_score = F.log_softmax(logits, axis=1)
        log_score = factor * log_score
        loss = self.nll(log_score, labels)
        return loss


def focal_loss(input: paddle.Tensor,
               target: paddle.Tensor,
               alpha: float,
               gamma: float=2.0,
               reduction: str='none',
               eps: float=1e-8) -> paddle.Tensor:
    r"""Function that computes Focal loss.
    See :class:`FocalLoss` below for details (adapted from kornia).
    """
    if not paddle.is_tensor(input):
        raise TypeError("Input type is not a paddle.Tensor. Got {}".format(
            type(input)))
    if not len(input.shape) >= 2:
        raise ValueError("Invalid input shape, we expect BxCx*. Got: {}".format(
            input.shape))
    if input.shape[0] != target.shape[0]:
        raise ValueError(
            'Expected input batch_size ({}) to match target batch_size ({}).'.
            format(input.shape[0], target.shape[0]))
    n = input.shape[0]
    out_size = (n, ) + tuple(input.shape[2:])
    if target.shape[1:] != input.shape[2:]:
        raise ValueError('Expected target size {}, got {}'.format(out_size,
                                                                  target.shape))
    if (isinstance(input.place, paddle.CUDAPlace) and
            isinstance(target.place, paddle.CPUPlace)) | (
                isinstance(input.place, paddle.CPUPlace) and
                isinstance(target.place, paddle.CUDAPlace)):
        raise ValueError(
            "input and target must be on the same device. Got: {} and {}".
            format(input.place, target.place))
    # compute softmax over the classes axis
    input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps

    # create the labels one hot tensor
    target_one_hot: paddle.Tensor = paddle.to_tensor(
        F.one_hot(
            target, num_classes=input.shape[1]).cast(input.dtype),
        place=input.place)

    # compute the actual focal loss
    weight = paddle.pow(-input_soft + 1., gamma)
    focal = -alpha * weight * paddle.log(input_soft)
    loss_tmp = paddle.sum(target_one_hot * focal, axis=1)

    if reduction == 'none':
        loss = loss_tmp
    elif reduction == 'mean':
        loss = paddle.mean(loss_tmp)
    elif reduction == 'sum':
        loss = paddle.sum(loss_tmp)
    else:
        raise NotImplementedError("Invalid reduction mode: {}".format(
            reduction))
    return loss


class FocalLoss(nn.Layer):
    r"""Criterion that computes Focal loss.
    According to [1], the Focal loss is computed as follows:
    .. math::
        \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t)
    where:
       - :math:`p_t` is the model's estimated probability for each class.
    Arguments:
        alpha (float): Weighting factor :math:`\alpha \in [0, 1]`.
        gamma (float): Focusing parameter :math:`\gamma >= 0`.
        reduction (str, optional): Specifies the reduction to apply to the
            output: ‘none’ | ‘mean’ | ‘sum’. ‘none’: no reduction will be applied,
            ‘mean’: the sum of the output will be divided by the number of elements
            in the output, ‘sum’: the output will be summed. Default: ‘none’.
    Shape:
        - Input: :math:`(N, C, *)` where C = number of classes.
        - Target: :math:`(N, *)` where each value is
          :math:`0 ≤ targets[i] ≤ C−1`.
    Examples:
        >>> N = 5  # num_classes
        >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'}
        >>> loss = FocalLoss(**kwargs)
        >>> input = paddle.randn([1, N, 3, 5])
        >>> input.stop_gradient = False
        >>> target = paddle.randint(0, N, [1, 3, 5], dtype='int64')
        >>> output = loss(input, target)
        >>> output.backward()
    References:
        [1] https://arxiv.org/abs/1708.02002
    """

    def __init__(self,
                 alpha: float,
                 gamma: float=2.0,
                 reduction: str='none') -> None:
        super(FocalLoss, self).__init__()
        self.alpha: float = alpha
        self.gamma: float = gamma
        self.reduction: str = reduction
        self.eps: float = 1e-6

    def forward(  # type: ignore
            self, input: paddle.Tensor,
            target: paddle.Tensor) -> paddle.Tensor:
        return focal_loss(input, target, self.alpha, self.gamma,
                          self.reduction, self.eps)


@register
class CLRNetLoss(nn.Layer):
    __shared__ = ['img_w', 'img_h', 'num_classes', 'num_points']

    def __init__(self,
                 cls_loss_weight=2.0,
                 xyt_loss_weight=0.2,
                 iou_loss_weight=2.0,
                 seg_loss_weight=1.0,
                 refine_layers=3,
                 num_points=72,
                 img_w=800,
                 img_h=320,
                 num_classes=5,
                 ignore_label=255,
                 bg_weight=0.4):
        super(CLRNetLoss, self).__init__()
        self.cls_loss_weight = cls_loss_weight
        self.xyt_loss_weight = xyt_loss_weight
        self.iou_loss_weight = iou_loss_weight
        self.seg_loss_weight = seg_loss_weight
        self.refine_layers = refine_layers
        self.img_w = img_w
        self.img_h = img_h
        self.n_strips = num_points - 1
        self.num_classes = num_classes
        self.ignore_label = ignore_label
        weights = paddle.ones(shape=[self.num_classes])
        weights[0] = bg_weight
        self.criterion = nn.NLLLoss(
            ignore_index=self.ignore_label, weight=weights)

    def forward(self, output, batch):
        predictions_lists = output['predictions_lists']
        targets = batch['lane_line'].clone()
        cls_criterion = FocalLoss(alpha=0.25, gamma=2.0)
        cls_loss = paddle.to_tensor(0.0)
        reg_xytl_loss = paddle.to_tensor(0.0)
        iou_loss = paddle.to_tensor(0.0)
        cls_acc = []
        cls_acc_stage = []

        for stage in range(self.refine_layers):
            predictions_list = predictions_lists[stage]
            for predictions, target in zip(predictions_list, targets):
                target = target[target[:, 1] == 1]

                if len(target) == 0:
                    # If there are no targets, all predictions have to be
                    # negatives (i.e., 0 confidence)
                    cls_target = paddle.zeros(
                        [predictions.shape[0]], dtype='int64')
                    cls_pred = predictions[:, :2]
                    cls_loss = cls_loss + cls_criterion(cls_pred,
                                                        cls_target).sum()
                    continue

                with paddle.no_grad():
                    matched_row_inds, matched_col_inds = assign(
                        predictions, target, self.img_w, self.img_h)

                # classification targets
                cls_target = paddle.zeros([predictions.shape[0]], dtype='int64')
                cls_target[matched_row_inds] = 1
                cls_pred = predictions[:, :2]

                # regression targets -> [start_y, start_x, theta, length]
                # (all transformed to absolute values), only on matched pairs
                reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6]
                reg_yxtl[:, 0] *= self.n_strips
                reg_yxtl[:, 1] *= (self.img_w - 1)
                reg_yxtl[:, 2] *= 180
                reg_yxtl[:, 3] *= self.n_strips

                target_yxtl = target.index_select(matched_col_inds)[..., 2:
                                                                    6].clone()

                # regression targets -> S coordinates (all transformed to
                # absolute values)
                reg_pred = predictions.index_select(matched_row_inds)[..., 6:]
                reg_pred *= (self.img_w - 1)
                reg_targets = target.index_select(matched_col_inds)[...,
                                                                    6:].clone()

                with paddle.no_grad():
                    predictions_starts = paddle.clip(
                        (predictions.index_select(matched_row_inds)[..., 2] *
                         self.n_strips).round().cast("int64"),
                        min=0,
                        max=self.
n_strips) # ensure the predictions starts is valid target_starts = ( target.index_select(matched_col_inds)[..., 2] * self.n_strips).round().cast("int64") target_yxtl[:, -1] -= ( predictions_starts - target_starts) # reg length # Loss calculation cls_loss = cls_loss + cls_criterion( cls_pred, cls_target).sum() / target.shape[0] target_yxtl[:, 0] *= self.n_strips target_yxtl[:, 2] *= 180 reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss( input=reg_yxtl, label=target_yxtl, reduction='none').mean() iou_loss = iou_loss + liou_loss( reg_pred, reg_targets, self.img_w, length=15) cls_accuracy = accuracy(cls_pred, cls_target) cls_acc_stage.append(cls_accuracy) cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5)) # extra segmentation loss seg_loss = self.criterion( F.log_softmax( output['seg'], axis=1), batch['seg'].cast('int64')) cls_loss /= (len(targets) * self.refine_layers) reg_xytl_loss /= (len(targets) * self.refine_layers) iou_loss /= (len(targets) * self.refine_layers) loss = cls_loss * self.cls_loss_weight \ + reg_xytl_loss * self.xyt_loss_weight \ + seg_loss * self.seg_loss_weight \ + iou_loss * self.iou_loss_weight return_value = { 'loss': loss, 'cls_loss': cls_loss * self.cls_loss_weight, 'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight, 'seg_loss': seg_loss * self.seg_loss_weight, 'iou_loss': iou_loss * self.iou_loss_weight } for i in range(self.refine_layers): if not isinstance(cls_acc[i], paddle.Tensor): cls_acc[i] = paddle.to_tensor(cls_acc[i]) return_value['stage_{}_acc'.format(i)] = cls_acc[i] return return_value ================================================ FILE: ppdet/modeling/losses/cot_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
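# COTLoss: a label-co-occurrence ("co-tuning") classification loss; the
# cot_relation table (presumably built by the co-tuning trainer, see
# ppdet/engine/trainer_cot.py) maps each ground-truth label to a soft
# distribution over classes. A minimal sketch of the core computation in
# forward() below, assuming `cot_relation` of shape [num_classes, num_classes]
# and RoI class `scores` whose last column is background:
#
#   soft_target = cot_relation[labels]                     # [n, num_classes]
#   log_prob = F.log_softmax(scores[:, :-1] * cot_scale)   # drop bg column
#   loss = cot_lambda * (-soft_target * log_prob).sum(-1).mean()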
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np from ppdet.core.workspace import register __all__ = ['COTLoss'] @register class COTLoss(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=80, cot_scale=1, cot_lambda=1): super(COTLoss, self).__init__() self.cot_scale = cot_scale self.cot_lambda = cot_lambda self.num_classes = num_classes def forward(self, scores, targets, cot_relation): cls_name = 'loss_bbox_cls_cot' loss_bbox = {} tgt_labels, tgt_bboxes, tgt_gt_inds = targets tgt_labels = paddle.concat(tgt_labels) if len( tgt_labels) > 1 else tgt_labels[0] mask = (tgt_labels < self.num_classes) valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() if valid_inds.shape[0] == 0: loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') else: tgt_labels = tgt_labels.cast('int64') valid_cot_targets = [] for i in range(tgt_labels.shape[0]): train_label = tgt_labels[i] if train_label < self.num_classes: valid_cot_targets.append(cot_relation[train_label]) coco_targets = paddle.to_tensor(valid_cot_targets) coco_targets.stop_gradient = True coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale) loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1)) return loss_bbox ================================================ FILE: ppdet/modeling/losses/ctfocal_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, serializable __all__ = ['CTFocalLoss'] @register @serializable class CTFocalLoss(object): """ CTFocalLoss: CornerNet & CenterNet Focal Loss Args: loss_weight (float): loss weight gamma (float): gamma parameter for Focal Loss """ def __init__(self, loss_weight=1., gamma=2.0): self.loss_weight = loss_weight self.gamma = gamma def __call__(self, pred, target): """ Calculate the loss Args: pred (Tensor): heatmap prediction target (Tensor): target for positive samples Return: ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet. Note that the values in target are in [0, 1] since gaussian is used to reduce the punishment and we treat [0, 1) as neg example. 
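            With p the heatmap prediction and t the gaussian-softened target,
            the loss computed below is
                L = -(sum over t == 1 of (1 - p)^gamma * log(p)
                      + sum over t < 1 of (1 - t)^4 * p^gamma * log(1 - p)) / N_pos,
            where N_pos is the number of t == 1 points (clamped to at least 1).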
""" fg_map = paddle.cast(target == 1, 'float32') fg_map.stop_gradient = True bg_map = paddle.cast(target < 1, 'float32') bg_map.stop_gradient = True neg_weights = paddle.pow(1 - target, 4) pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred, self.gamma) * fg_map neg_loss = 0 - paddle.log(1 - pred) * paddle.pow( pred, self.gamma) * neg_weights * bg_map pos_loss = paddle.sum(pos_loss) neg_loss = paddle.sum(neg_loss) fg_num = paddle.sum(fg_map) ct_focal_loss = (pos_loss + neg_loss) / ( fg_num + paddle.cast(fg_num == 0, 'float32')) return ct_focal_loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/detr_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from .iou_loss import GIoULoss from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits from ..bbox_utils import bbox_iou __all__ = ['DETRLoss', 'DINOLoss', 'DINOv3Loss'] @register class DETRLoss(nn.Layer): __shared__ = ['num_classes', 'use_focal_loss'] __inject__ = ['matcher'] def __init__(self, num_classes=80, matcher='HungarianMatcher', loss_coeff={ 'class': 1, 'bbox': 5, 'giou': 2, 'no_object': 0.1, 'mask': 1, 'dice': 1 }, aux_loss=True, use_focal_loss=False, use_vfl=False, vfl_iou_type='bbox', use_uni_match=False, uni_match_ind=0): r""" Args: num_classes (int): The number of classes. matcher (HungarianMatcher): It computes an assignment between the targets and the predictions of the network. loss_coeff (dict): The coefficient of loss. aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. use_focal_loss (bool): Use focal loss or not. 
""" super(DETRLoss, self).__init__() self.num_classes = num_classes self.matcher = matcher self.loss_coeff = loss_coeff self.aux_loss = aux_loss self.use_focal_loss = use_focal_loss self.use_vfl = use_vfl self.vfl_iou_type = vfl_iou_type self.use_uni_match = use_uni_match self.uni_match_ind = uni_match_ind if not self.use_focal_loss: self.loss_coeff['class'] = paddle.full([num_classes + 1], loss_coeff['class']) self.loss_coeff['class'][-1] = loss_coeff['no_object'] self.giou_loss = GIoULoss() def _get_loss_class(self, logits, gt_class, match_indices, bg_index, num_gts, postfix="", iou_score=None, gt_score=None): # logits: [b, query, num_classes], gt_class: list[[n, 1]] name_class = "loss_class" + postfix target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64') bs, num_query_objects = target_label.shape num_gt = sum(len(a) for a in gt_class) if num_gt > 0: index, updates = self._get_index_updates(num_query_objects, gt_class, match_indices) target_label = paddle.scatter( target_label.reshape([-1, 1]), index, updates.astype('int64')) target_label = target_label.reshape([bs, num_query_objects]) if self.use_focal_loss: target_label = F.one_hot(target_label, self.num_classes + 1)[..., :-1] if iou_score is not None and self.use_vfl: if gt_score is not None: target_score = paddle.zeros([bs, num_query_objects]) target_score = paddle.scatter( target_score.reshape([-1, 1]), index, gt_score) target_score = target_score.reshape( [bs, num_query_objects, 1]) * target_label target_score_iou = paddle.zeros([bs, num_query_objects]) target_score_iou = paddle.scatter( target_score_iou.reshape([-1, 1]), index, iou_score) target_score_iou = target_score_iou.reshape( [bs, num_query_objects, 1]) * target_label target_score = paddle.multiply(target_score, target_score_iou) loss_ = self.loss_coeff[ 'class'] * varifocal_loss_with_logits( logits, target_score, target_label, num_gts / num_query_objects) else: target_score = paddle.zeros([bs, num_query_objects]) if num_gt > 0: target_score = paddle.scatter( target_score.reshape([-1, 1]), index, iou_score) target_score = target_score.reshape( [bs, num_query_objects, 1]) * target_label loss_ = self.loss_coeff[ 'class'] * varifocal_loss_with_logits( logits, target_score, target_label, num_gts / num_query_objects) else: loss_ = self.loss_coeff['class'] * sigmoid_focal_loss( logits, target_label, num_gts / num_query_objects) else: loss_ = F.cross_entropy( logits, target_label, weight=self.loss_coeff['class']) return {name_class: loss_} def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, postfix=""): # boxes: [b, query, 4], gt_bbox: list[[n, 4]] name_bbox = "loss_bbox" + postfix name_giou = "loss_giou" + postfix loss = dict() if sum(len(a) for a in gt_bbox) == 0: loss[name_bbox] = paddle.to_tensor([0.]) loss[name_giou] = paddle.to_tensor([0.]) return loss src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox, match_indices) loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss( src_bbox, target_bbox, reduction='sum') / num_gts loss[name_giou] = self.giou_loss( bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox)) loss[name_giou] = loss[name_giou].sum() / num_gts loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou] return loss def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): # masks: [b, query, h, w], gt_mask: list[[n, H, W]] name_mask = "loss_mask" + postfix name_dice = "loss_dice" + postfix loss = dict() if sum(len(a) for a in gt_mask) == 0: loss[name_mask] = paddle.to_tensor([0.]) 
loss[name_dice] = paddle.to_tensor([0.]) return loss src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, match_indices) src_masks = F.interpolate( src_masks.unsqueeze(0), size=target_masks.shape[-2:], mode="bilinear")[0] loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss( src_masks, target_masks, paddle.to_tensor( [num_gts], dtype='float32')) loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( src_masks, target_masks, num_gts) return loss def _dice_loss(self, inputs, targets, num_gts): inputs = F.sigmoid(inputs) inputs = inputs.flatten(1) targets = targets.flatten(1) numerator = 2 * (inputs * targets).sum(1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_gts def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index, num_gts, dn_match_indices=None, postfix="", masks=None, gt_mask=None, gt_score=None): loss_class = [] loss_bbox, loss_giou = [], [] loss_mask, loss_dice = [], [] if dn_match_indices is not None: match_indices = dn_match_indices elif self.use_uni_match: match_indices = self.matcher( boxes[self.uni_match_ind], logits[self.uni_match_ind], gt_bbox, gt_class, masks=masks[self.uni_match_ind] if masks is not None else None, gt_mask=gt_mask) for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): aux_masks = masks[i] if masks is not None else None if not self.use_uni_match and dn_match_indices is None: match_indices = self.matcher( aux_boxes, aux_logits, gt_bbox, gt_class, masks=aux_masks, gt_mask=gt_mask) if self.use_vfl: if sum(len(a) for a in gt_bbox) > 0: src_bbox, target_bbox = self._get_src_target_assign( aux_boxes.detach(), gt_bbox, match_indices) iou_score = bbox_iou( bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) else: iou_score = None if gt_score is not None: _, target_score = self._get_src_target_assign( logits[-1].detach(), gt_score, match_indices) else: iou_score = None loss_class.append( self._get_loss_class( aux_logits, gt_class, match_indices, bg_index, num_gts, postfix, iou_score, gt_score=target_score if gt_score is not None else None)['loss_class' + postfix]) loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices, num_gts, postfix) loss_bbox.append(loss_['loss_bbox' + postfix]) loss_giou.append(loss_['loss_giou' + postfix]) if masks is not None and gt_mask is not None: loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, num_gts, postfix) loss_mask.append(loss_['loss_mask' + postfix]) loss_dice.append(loss_['loss_dice' + postfix]) loss = { "loss_class_aux" + postfix: paddle.add_n(loss_class), "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), "loss_giou_aux" + postfix: paddle.add_n(loss_giou) } if masks is not None and gt_mask is not None: loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice) return loss def _get_index_updates(self, num_query_objects, target, match_indices): batch_idx = paddle.concat([ paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices) ]) src_idx = paddle.concat([src for (src, _) in match_indices]) src_idx += (batch_idx * num_query_objects) target_assign = paddle.concat([ paddle.gather( t, dst, axis=0) for t, (_, dst) in zip(target, match_indices) ]) return src_idx, target_assign def _get_src_target_assign(self, src, target, match_indices): src_assign = paddle.concat([ paddle.gather( t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]]) for t, (I, _) in zip(src, 
match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def _get_num_gts(self, targets, dtype="float32"):
        num_gts = sum(len(a) for a in targets)
        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(num_gts)
            num_gts /= paddle.distributed.get_world_size()
        num_gts = paddle.clip(num_gts, min=1.)
        return num_gts

    def _get_prediction_loss(self,
                             boxes,
                             logits,
                             gt_bbox,
                             gt_class,
                             masks=None,
                             gt_mask=None,
                             postfix="",
                             dn_match_indices=None,
                             num_gts=1,
                             gt_score=None):
        if dn_match_indices is None:
            match_indices = self.matcher(
                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
        else:
            match_indices = dn_match_indices

        if self.use_vfl:
            if gt_score is not None:  #ssod
                _, target_score = self._get_src_target_assign(
                    logits[-1].detach(), gt_score, match_indices)
            elif sum(len(a) for a in gt_bbox) > 0:
                if self.vfl_iou_type == 'bbox':
                    src_bbox, target_bbox = self._get_src_target_assign(
                        boxes.detach(), gt_bbox, match_indices)
                    iou_score = bbox_iou(
                        bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
                        bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
                elif self.vfl_iou_type == 'mask':
                    # NOTE: the condition must not be wrapped together with the
                    # message in parentheses, or the assert is always true
                    assert masks is not None and gt_mask is not None, \
                        'Make sure the input has `mask` and `gt_mask`'
                    assert sum(len(a) for a in gt_mask) > 0
                    src_mask, target_mask = self._get_src_target_assign(
                        masks.detach(), gt_mask, match_indices)
                    src_mask = F.interpolate(
                        src_mask.unsqueeze(0),
                        scale_factor=2,
                        mode='bilinear',
                        align_corners=False).squeeze(0)
                    target_mask = F.interpolate(
                        target_mask.unsqueeze(0),
                        size=src_mask.shape[-2:],
                        mode='bilinear',
                        align_corners=False).squeeze(0)
                    src_mask = src_mask.flatten(1)
                    src_mask = F.sigmoid(src_mask)
                    src_mask = paddle.where(
                        src_mask > 0.5, 1., 0.).astype(masks.dtype)
                    target_mask = target_mask.flatten(1)
                    target_mask = paddle.where(
                        target_mask > 0.5, 1., 0.).astype(masks.dtype)
                    inter = (src_mask * target_mask).sum(1)
                    union = src_mask.sum(1) + target_mask.sum(1) - inter
                    iou_score = (inter + 1e-2) / (union + 1e-2)
                    iou_score = iou_score.unsqueeze(-1)
                else:
                    iou_score = None
            else:
                iou_score = None
        else:
            iou_score = None

        loss = dict()
        loss.update(
            self._get_loss_class(
                logits,
                gt_class,
                match_indices,
                self.num_classes,
                num_gts,
                postfix,
                iou_score,
                gt_score=target_score if gt_score is not None else None))
        loss.update(
            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
                                postfix))
        if masks is not None and gt_mask is not None:
            loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
                                    postfix))
        return loss

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                gt_score=None,
                o2m=1,
                **kwargs):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [l, b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]
            postfix (str): postfix of loss name
        """
        dn_match_indices = kwargs.get("dn_match_indices", None)
        num_gts = kwargs.get("num_gts", None)
        if num_gts is None:
            num_gts = self._get_num_gts(gt_class)
        total_loss = self._get_prediction_loss(
            boxes[-1],
            logits[-1],
            gt_bbox,
            gt_class,
            masks=masks[-1] if masks is not None else None,
            gt_mask=gt_mask,
            postfix=postfix,
            dn_match_indices=dn_match_indices,
            num_gts=num_gts,
            gt_score=gt_score if gt_score is not None else None)

        if self.aux_loss:
            total_loss.update(
self._get_loss_aux( boxes[:-1], logits[:-1], gt_bbox, gt_class, self.num_classes, num_gts, dn_match_indices, postfix, masks=masks[:-1] if masks is not None else None, gt_mask=gt_mask, gt_score=gt_score if gt_score is not None else None)) return total_loss @register class DINOLoss(DETRLoss): def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None, postfix="", dn_out_bboxes=None, dn_out_logits=None, dn_meta=None, gt_score=None, **kwargs): num_gts = self._get_num_gts(gt_class) total_loss = super(DINOLoss, self).forward( boxes, logits, gt_bbox, gt_class, num_gts=num_gts, gt_score=gt_score) if dn_meta is not None: dn_positive_idx, dn_num_group = \ dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = self.get_dn_match_indices( gt_class, dn_positive_idx, dn_num_group) # compute denoising training loss num_gts *= dn_num_group dn_loss = super(DINOLoss, self).forward( dn_out_bboxes, dn_out_logits, gt_bbox, gt_class, postfix="_dn", dn_match_indices=dn_match_indices, num_gts=num_gts, gt_score=gt_score) total_loss.update(dn_loss) else: total_loss.update( {k + '_dn': paddle.to_tensor([0.]) for k in total_loss.keys()}) return total_loss @staticmethod def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): dn_match_indices = [] for i in range(len(labels)): num_gt = len(labels[i]) if num_gt > 0: gt_idx = paddle.arange(end=num_gt, dtype="int64") gt_idx = gt_idx.tile([dn_num_group]) assert len(dn_positive_idx[i]) == len(gt_idx) dn_match_indices.append((dn_positive_idx[i], gt_idx)) else: dn_match_indices.append((paddle.zeros( [0], dtype="int64"), paddle.zeros( [0], dtype="int64"))) return dn_match_indices @register class DINOv3Loss(DETRLoss): def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None, postfix="", dn_out_bboxes=None, dn_out_logits=None, dn_meta=None, gt_score=None, o2m=1, **kwargs): if o2m != 1: gt_boxes_copy = [box.tile([o2m, 1]) for box in gt_bbox] gt_class_copy = [label.tile([o2m, 1]) for label in gt_class] else: gt_boxes_copy = gt_bbox gt_class_copy = gt_class num_gts_copy = self._get_num_gts(gt_class_copy) total_loss = self._get_prediction_loss( boxes[-1], logits[-1], gt_boxes_copy, gt_class_copy, masks=masks[-1] if masks is not None else None, gt_mask=gt_mask, postfix=postfix, dn_match_indices=None, num_gts=num_gts_copy, gt_score=gt_score if gt_score is not None else None) if self.aux_loss: total_loss.update( self._get_loss_aux( boxes[:-1], logits[:-1], gt_boxes_copy, gt_class_copy, self.num_classes, num_gts_copy, dn_match_indices=None, postfix=postfix, masks=masks[:-1] if masks is not None else None, gt_mask=gt_mask, gt_score=gt_score if gt_score is not None else None)) if dn_meta is not None: num_gts = self._get_num_gts(gt_class) dn_positive_idx, dn_num_group = \ dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = self.get_dn_match_indices( gt_class, dn_positive_idx, dn_num_group) # compute denoising training loss num_gts *= dn_num_group dn_loss = super(DINOv3Loss, self).forward( dn_out_bboxes, dn_out_logits, gt_bbox, gt_class, postfix="_dn", dn_match_indices=dn_match_indices, num_gts=num_gts, gt_score=gt_score) total_loss.update(dn_loss) else: total_loss.update( {k + '_dn': paddle.to_tensor([0.]) for k in total_loss.keys()}) return total_loss @staticmethod def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): dn_match_indices = [] for i in 
range(len(labels)): num_gt = len(labels[i]) if num_gt > 0: gt_idx = paddle.arange(end=num_gt, dtype="int64") gt_idx = gt_idx.tile([dn_num_group]) assert len(dn_positive_idx[i]) == len(gt_idx) dn_match_indices.append((dn_positive_idx[i], gt_idx)) else: dn_match_indices.append((paddle.zeros( [0], dtype="int64"), paddle.zeros( [0], dtype="int64"))) return dn_match_indices @register class MaskDINOLoss(DETRLoss): __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points'] __inject__ = ['matcher'] def __init__(self, num_classes=80, matcher='HungarianMatcher', loss_coeff={ 'class': 4, 'bbox': 5, 'giou': 2, 'mask': 5, 'dice': 5 }, aux_loss=True, use_focal_loss=False, use_vfl=False, vfl_iou_type='bbox', num_sample_points=12544, oversample_ratio=3.0, important_sample_ratio=0.75): super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff, aux_loss, use_focal_loss, use_vfl, vfl_iou_type) assert oversample_ratio >= 1 assert important_sample_ratio <= 1 and important_sample_ratio >= 0 self.num_sample_points = num_sample_points self.oversample_ratio = oversample_ratio self.important_sample_ratio = important_sample_ratio self.num_oversample_points = int(num_sample_points * oversample_ratio) self.num_important_points = int(num_sample_points * important_sample_ratio) self.num_random_points = num_sample_points - self.num_important_points def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None, postfix="", dn_out_bboxes=None, dn_out_logits=None, dn_out_masks=None, dn_meta=None, **kwargs): num_gts = self._get_num_gts(gt_class) total_loss = super(MaskDINOLoss, self).forward( boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask, num_gts=num_gts) if dn_meta is not None: dn_positive_idx, dn_num_group = \ dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = DINOLoss.get_dn_match_indices( gt_class, dn_positive_idx, dn_num_group) # compute denoising training loss num_gts *= dn_num_group dn_loss = super(MaskDINOLoss, self).forward( dn_out_bboxes, dn_out_logits, gt_bbox, gt_class, masks=dn_out_masks, gt_mask=gt_mask, postfix="_dn", dn_match_indices=dn_match_indices, num_gts=num_gts) total_loss.update(dn_loss) else: total_loss.update( {k + '_dn': paddle.to_tensor([0.]) for k in total_loss.keys()}) return total_loss def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): # masks: [b, query, h, w], gt_mask: list[[n, H, W]] name_mask = "loss_mask" + postfix name_dice = "loss_dice" + postfix loss = dict() if sum(len(a) for a in gt_mask) == 0: loss[name_mask] = paddle.to_tensor([0.]) loss[name_dice] = paddle.to_tensor([0.]) return loss src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, match_indices) # sample points sample_points = self._get_point_coords_by_uncertainty(src_masks) sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0 src_masks = F.grid_sample( src_masks.unsqueeze(1), sample_points, align_corners=False).squeeze([1, 2]) target_masks = F.grid_sample( target_masks.unsqueeze(1), sample_points, align_corners=False).squeeze([1, 2]).detach() loss[name_mask] = self.loss_coeff[ 'mask'] * F.binary_cross_entropy_with_logits( src_masks, target_masks, reduction='none').mean(1).sum() / num_gts loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( src_masks, target_masks, num_gts) return loss def _get_point_coords_by_uncertainty(self, masks): # Sample points based on their uncertainty. 
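        # PointRend-style importance sampling: oversample random points, keep
        # the num_important_points whose mask logits are closest to 0 (largest
        # uncertainty, scored as -|logit|), then top up with num_random_points
        # fresh uniform points.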
masks = masks.detach() num_masks = masks.shape[0] sample_points = paddle.rand( [num_masks, 1, self.num_oversample_points, 2]) out_mask = F.grid_sample( masks.unsqueeze(1), 2.0 * sample_points - 1.0, align_corners=False).squeeze([1, 2]) out_mask = -paddle.abs(out_mask) _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1) batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind) if self.num_random_points > 0: sample_points = paddle.concat( [ sample_points, paddle.rand([num_masks, self.num_random_points, 2]) ], axis=1) return sample_points ================================================ FILE: ppdet/modeling/losses/fairmot_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle.nn.initializer import Constant from ppdet.core.workspace import register __all__ = ['FairMOTLoss'] @register class FairMOTLoss(nn.Layer): def __init__(self): super(FairMOTLoss, self).__init__() self.det_weight = self.create_parameter( shape=[1], default_initializer=Constant(-1.85)) self.reid_weight = self.create_parameter( shape=[1], default_initializer=Constant(-1.05)) def forward(self, det_loss, reid_loss): loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp( -self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight ) loss *= 0.5 return {'loss': loss} ================================================ FILE: ppdet/modeling/losses/fcos_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
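# The FCOS losses below share one decomposition: sigmoid focal loss on the
# per-point classification logits, an IoU-based term on the (l, t, r, b)
# offsets of positive points, and a BCE quality branch (centerness or IoU).
# A minimal sketch of the GIoU variant computed by _iou_loss, assuming
# aligned (l, t, r, b) offset tensors for prediction p and target t:
#
#   area_p = (pl + pr) * (pt + pb);  area_t = (tl + tr) * (tt + tb)
#   inter = (min(pl, tl) + min(pr, tr)) * (min(pt, tt) + min(pb, tb))
#   union = area_p + area_t - inter
#   enclose = (max(pl, tl) + max(pr, tr)) * (max(pt, tt) + max(pb, tb))
#   iou = (inter + 1) / (union + 1)
#   loss_giou = 1 - (iou - (enclose - union) / enclose)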
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling import ops from functools import partial __all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR'] def flatten_tensor(inputs, channel_first=False): """ Flatten a Tensor Args: inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C] channel_first (bool): If true the dimension order of Tensor is [N, C, H, W], otherwise is [N, H, W, C] Return: output_channel_last (Tensor): The flattened Tensor in channel_last style """ if channel_first: input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1]) else: input_channel_last = inputs output_channel_last = paddle.flatten( input_channel_last, start_axis=0, stop_axis=2) return output_channel_last @register class FCOSLoss(nn.Layer): """ FCOSLoss Args: loss_alpha (float): alpha in focal loss loss_gamma (float): gamma in focal loss iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU reg_weights (float): weight for location loss quality (str): quality branch, centerness/iou """ def __init__(self, loss_alpha=0.25, loss_gamma=2.0, iou_loss_type="giou", reg_weights=1.0, quality='centerness'): super(FCOSLoss, self).__init__() self.loss_alpha = loss_alpha self.loss_gamma = loss_gamma self.iou_loss_type = iou_loss_type self.reg_weights = reg_weights self.quality = quality def _iou_loss(self, pred, targets, positive_mask, weights=None, return_iou=False): """ Calculate the loss for location prediction Args: pred (Tensor): bounding boxes prediction targets (Tensor): targets for positive samples positive_mask (Tensor): mask of positive samples weights (Tensor): weights for each positive samples Return: loss (Tensor): location loss """ plw = pred[:, 0] * positive_mask pth = pred[:, 1] * positive_mask prw = pred[:, 2] * positive_mask pbh = pred[:, 3] * positive_mask tlw = targets[:, 0] * positive_mask tth = targets[:, 1] * positive_mask trw = targets[:, 2] * positive_mask tbh = targets[:, 3] * positive_mask tlw.stop_gradient = True trw.stop_gradient = True tth.stop_gradient = True tbh.stop_gradient = True ilw = paddle.minimum(plw, tlw) irw = paddle.minimum(prw, trw) ith = paddle.minimum(pth, tth) ibh = paddle.minimum(pbh, tbh) clw = paddle.maximum(plw, tlw) crw = paddle.maximum(prw, trw) cth = paddle.maximum(pth, tth) cbh = paddle.maximum(pbh, tbh) area_predict = (plw + prw) * (pth + pbh) area_target = (tlw + trw) * (tth + tbh) area_inter = (ilw + irw) * (ith + ibh) ious = (area_inter + 1.0) / ( area_predict + area_target - area_inter + 1.0) ious = ious * positive_mask if return_iou: return ious if self.iou_loss_type.lower() == "linear_iou": loss = 1.0 - ious elif self.iou_loss_type.lower() == "giou": area_uniou = area_predict + area_target - area_inter area_circum = (clw + crw) * (cth + cbh) + 1e-7 giou = ious - (area_circum - area_uniou) / area_circum loss = 1.0 - giou elif self.iou_loss_type.lower() == "iou": loss = 0.0 - paddle.log(ious) else: raise KeyError if weights is not None: loss = loss * weights return loss def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_center): """ Calculate the loss for classification, location and centerness Args: cls_logits (list): list of Tensor, which is predicted score for all anchor points with shape [N, M, C] bboxes_reg (list): list of Tensor, which is predicted offsets for all anchor points with shape [N, M, 4] centerness (list): list of 
Tensor, which is predicted centerness for all anchor points with shape [N, M, 1] tag_labels (list): list of Tensor, which is category targets for each anchor point tag_bboxes (list): list of Tensor, which is bounding boxes targets for positive samples tag_center (list): list of Tensor, which is centerness targets for positive samples Return: loss (dict): loss composed by classification loss, bounding box """ cls_logits_flatten_list = [] bboxes_reg_flatten_list = [] centerness_flatten_list = [] tag_labels_flatten_list = [] tag_bboxes_flatten_list = [] tag_center_flatten_list = [] num_lvl = len(cls_logits) for lvl in range(num_lvl): cls_logits_flatten_list.append( flatten_tensor(cls_logits[lvl], True)) bboxes_reg_flatten_list.append( flatten_tensor(bboxes_reg[lvl], True)) centerness_flatten_list.append( flatten_tensor(centerness[lvl], True)) tag_labels_flatten_list.append( flatten_tensor(tag_labels[lvl], False)) tag_bboxes_flatten_list.append( flatten_tensor(tag_bboxes[lvl], False)) tag_center_flatten_list.append( flatten_tensor(tag_center[lvl], False)) cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) tag_labels_flatten.stop_gradient = True tag_bboxes_flatten.stop_gradient = True tag_center_flatten.stop_gradient = True mask_positive_bool = tag_labels_flatten > 0 mask_positive_bool.stop_gradient = True mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") mask_positive_float.stop_gradient = True num_positive_fp32 = paddle.sum(mask_positive_float) num_positive_fp32.stop_gradient = True num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") num_positive_int32 = num_positive_int32 * 0 + 1 num_positive_int32.stop_gradient = True normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) normalize_sum.stop_gradient = True # 1. cls_logits: sigmoid_focal_loss # expand onehot labels num_classes = cls_logits_flatten.shape[-1] tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1) tag_labels_flatten_bin = F.one_hot( tag_labels_flatten, num_classes=1 + num_classes) tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:] # sigmoid_focal_loss cls_loss = F.sigmoid_focal_loss( cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32 if self.quality == 'centerness': # 2. bboxes_reg: giou_loss mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) reg_loss = self._iou_loss( bboxes_reg_flatten, tag_bboxes_flatten, mask_positive_float, weights=tag_center_flatten) reg_loss = reg_loss * mask_positive_float / normalize_sum # 3. centerness: sigmoid_cross_entropy_with_logits_loss centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1) quality_loss = ops.sigmoid_cross_entropy_with_logits( centerness_flatten, tag_center_flatten) quality_loss = quality_loss * mask_positive_float / num_positive_fp32 elif self.quality == 'iou': # 2. 
bboxes_reg: giou_loss
            mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
            tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
            reg_loss = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=None)
            reg_loss = reg_loss * mask_positive_float / num_positive_fp32
            # num_positive_fp32 is num_foreground

            # 3. centerness: sigmoid_cross_entropy_with_logits_loss
            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
            gt_ious = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=None,
                return_iou=True)
            quality_loss = ops.sigmoid_cross_entropy_with_logits(
                centerness_flatten, gt_ious)
            quality_loss = quality_loss * mask_positive_float / num_positive_fp32
        else:
            raise Exception(f'Unknown quality type: {self.quality}')

        loss_all = {
            "loss_cls": paddle.sum(cls_loss),
            "loss_box": paddle.sum(reg_loss),
            "loss_quality": paddle.sum(quality_loss),
        }
        return loss_all


@register
class FCOSLossMILC(FCOSLoss):
    """
    FCOSLossMILC for ARSL in semi-supervised detection (SSOD)
    Args:
        loss_alpha (float): alpha in focal loss
        loss_gamma (float): gamma in focal loss
        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
        reg_weights (float): weight for location loss
    """

    def __init__(self,
                 loss_alpha=0.25,
                 loss_gamma=2.0,
                 iou_loss_type="giou",
                 reg_weights=1.0):
        super(FCOSLossMILC, self).__init__()
        self.loss_alpha = loss_alpha
        self.loss_gamma = loss_gamma
        self.iou_loss_type = iou_loss_type
        self.reg_weights = reg_weights

    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding boxes prediction
            targets (Tensor): targets for positive samples
            weights (Tensor): weights for each positive sample
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError
        if weights is not None:
            loss = loss * weights
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # temp function: calculate iou between bbox and target
    def _bbox_overlap_align(self, pred, targets):
        assert pred.shape[0] == targets.shape[0], \
            'the pred should be aligned with target.'
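        # pred and target offsets are measured from the same anchor point, so
        # the aligned IoU reduces to per-side min arithmetic (with the same
        # +1.0 smoothing as _iou_loss and no enclosing-box term).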
plw = pred[:, 0] pth = pred[:, 1] prw = pred[:, 2] pbh = pred[:, 3] tlw = targets[:, 0] tth = targets[:, 1] trw = targets[:, 2] tbh = targets[:, 3] ilw = paddle.minimum(plw, tlw) irw = paddle.minimum(prw, trw) ith = paddle.minimum(pth, tth) ibh = paddle.minimum(pbh, tbh) area_predict = (plw + prw) * (pth + pbh) area_target = (tlw + trw) * (tth + tbh) area_inter = (ilw + irw) * (ith + ibh) ious = (area_inter + 1.0) / ( area_predict + area_target - area_inter + 1.0) return ious def iou_based_soft_label_loss(self, pred, target, alpha=0.75, gamma=2.0, iou_weighted=False, implicit_iou=None, avg_factor=None): assert pred.shape == target.shape pred = F.sigmoid(pred) target = target.cast(pred.dtype) if implicit_iou is not None: pred = pred * implicit_iou if iou_weighted: focal_weight = (pred - target).abs().pow(gamma) * target * (target > 0.0).cast('float32') + \ alpha * (pred - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') else: focal_weight = (pred - target).abs().pow(gamma) * (target > 0.0).cast('float32') + \ alpha * (pred - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') # focal loss loss = F.binary_cross_entropy( pred, target, reduction='none') * focal_weight if avg_factor is not None: loss = loss / avg_factor return loss def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_center): """ Calculate the loss for classification, location and centerness Args: cls_logits (list): list of Tensor, which is predicted score for all anchor points with shape [N, M, C] bboxes_reg (list): list of Tensor, which is predicted offsets for all anchor points with shape [N, M, 4] centerness (list): list of Tensor, which is predicted centerness for all anchor points with shape [N, M, 1] tag_labels (list): list of Tensor, which is category targets for each anchor point tag_bboxes (list): list of Tensor, which is bounding boxes targets for positive samples tag_center (list): list of Tensor, which is centerness targets for positive samples Return: loss (dict): loss composed by classification loss, bounding box """ cls_logits_flatten_list = [] bboxes_reg_flatten_list = [] centerness_flatten_list = [] tag_labels_flatten_list = [] tag_bboxes_flatten_list = [] tag_center_flatten_list = [] num_lvl = len(cls_logits) for lvl in range(num_lvl): cls_logits_flatten_list.append( flatten_tensor(cls_logits[lvl], True)) bboxes_reg_flatten_list.append( flatten_tensor(bboxes_reg[lvl], True)) centerness_flatten_list.append( flatten_tensor(centerness[lvl], True)) tag_labels_flatten_list.append( flatten_tensor(tag_labels[lvl], False)) tag_bboxes_flatten_list.append( flatten_tensor(tag_bboxes[lvl], False)) tag_center_flatten_list.append( flatten_tensor(tag_center[lvl], False)) cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) tag_labels_flatten.stop_gradient = True tag_bboxes_flatten.stop_gradient = True tag_center_flatten.stop_gradient = True # find positive index mask_positive_bool = tag_labels_flatten > 0 mask_positive_bool.stop_gradient = True mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") mask_positive_float.stop_gradient = True num_positive_fp32 = paddle.sum(mask_positive_float) 
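        # num_positive_fp32 counts foreground points across the batch; it
        # normalizes the classification and iou losses below, while
        # normalize_sum (the centerness-weighted positive count) normalizes
        # the box regression loss.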
num_positive_fp32.stop_gradient = True num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") num_positive_int32 = num_positive_int32 * 0 + 1 num_positive_int32.stop_gradient = True # centerness target is used as reg weight normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) normalize_sum.stop_gradient = True # 1. IoU-Based soft label loss # calculate iou with paddle.no_grad(): pos_ind = paddle.nonzero( tag_labels_flatten.reshape([-1]) > 0).reshape([-1]) pos_pred = bboxes_reg_flatten[pos_ind] pos_target = tag_bboxes_flatten[pos_ind] bbox_iou = self._bbox_overlap_align(pos_pred, pos_target) # pos labels pos_labels = tag_labels_flatten[pos_ind].squeeze(1) cls_target = paddle.zeros(cls_logits_flatten.shape) cls_target[pos_ind, pos_labels - 1] = bbox_iou cls_loss = self.iou_based_soft_label_loss( cls_logits_flatten, cls_target, implicit_iou=F.sigmoid(centerness_flatten), avg_factor=num_positive_fp32) # 2. bboxes_reg: giou_loss mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) reg_loss = self._iou_loss( bboxes_reg_flatten, tag_bboxes_flatten, mask_positive_float, weights=tag_center_flatten) reg_loss = reg_loss * mask_positive_float / normalize_sum # 3. iou loss pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind] loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred, bbox_iou) loss_iou = loss_iou / num_positive_fp32 * 0.5 loss_all = { "loss_cls": paddle.sum(cls_loss), "loss_box": paddle.sum(reg_loss), 'loss_iou': paddle.sum(loss_iou), } return loss_all # Concat multi-level feature maps by image def levels_to_images(mlvl_tensor): batch_size = mlvl_tensor[0].shape[0] batch_list = [[] for _ in range(batch_size)] channels = mlvl_tensor[0].shape[1] for t in mlvl_tensor: t = t.transpose([0, 2, 3, 1]) t = t.reshape([batch_size, -1, channels]) for img in range(batch_size): batch_list[img].append(t[img]) return [paddle.concat(item, axis=0) for item in batch_list] def multi_apply(func, *args, **kwargs): """Apply function to a list of arguments. Note: This function applies the ``func`` to multiple inputs and map the multiple outputs of the ``func`` into different list. Each list contains the same type of outputs corresponding to different inputs. 
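    Example (illustrative, with a hypothetical toy function):
        >>> add_mul = lambda a, b: (a + b, a * b)
        >>> sums, prods = multi_apply(add_mul, [1, 2], [3, 4])
        >>> # sums == [4, 6], prods == [3, 8]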
    Args:
        func (Function): A function that will be applied to a list of
            arguments
    Returns:
        tuple(list): A tuple containing multiple list, each list contains \
            a kind of returned results by the function
    """
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))


@register
class FCOSLossCR(FCOSLossMILC):
    """
    FCOSLoss of Consistency Regularization
    """

    def __init__(self,
                 iou_loss_type="giou",
                 cls_weight=2.0,
                 reg_weight=2.0,
                 iou_weight=0.5,
                 hard_neg_mining_flag=True):
        super(FCOSLossCR, self).__init__()
        self.iou_loss_type = iou_loss_type
        self.cls_weight = cls_weight
        self.reg_weight = reg_weight
        self.iou_weight = iou_weight
        self.hard_neg_mining_flag = hard_neg_mining_flag

    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding boxes prediction
            targets (Tensor): targets for positive samples
            weights (Tensor): weights for each positive samples
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError
        if weights is not None:
            loss = loss * weights
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # calculate iou between bbox and target
    def bbox_overlap_align(self, pred, targets):
        assert pred.shape[0] == targets.shape[0], \
            'the pred should be aligned with target.'
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)
        return ious

    # cls loss: iou-based soft label with joint iou
    def quality_focal_loss(self,
                           stu_cls,
                           targets,
                           quality=None,
                           weights=None,
                           alpha=0.75,
                           gamma=2.0,
                           avg_factor=None):
        # NOTE: avg_factor originally defaulted to the string 'sum', which
        # would break `loss / avg_factor`; all callers pass a float
        # normalizer, so None is the safe default.
        stu_cls = F.sigmoid(stu_cls)
        if quality is not None:
            stu_cls = stu_cls * F.sigmoid(quality)
        focal_weight = (stu_cls - targets).abs().pow(gamma) * (targets > 0.0).cast('float32') + \
            alpha * (stu_cls - targets).abs().pow(gamma) * \
            (targets <= 0.0).cast('float32')
        loss = F.binary_cross_entropy(
            stu_cls, targets, reduction='none') * focal_weight
        if weights is not None:
            loss = loss * weights.reshape([-1, 1])
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # generate points according to feature maps
    def compute_locations_by_level(self, fpn_stride, h, w):
        """
        Compute locations of anchor points of each FPN layer
        Return:
            Anchor points locations of current FPN feature map
        """
        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
        shift_x = paddle.unsqueeze(shift_x, axis=0)
        shift_y = paddle.unsqueeze(shift_y, axis=1)
        shift_x = paddle.expand(shift_x, shape=[h, w])
        shift_y = paddle.expand(shift_y, shape=[h, w])
        shift_x = paddle.reshape(shift_x, shape=[-1])
        shift_y = paddle.reshape(shift_y, shape=[-1])
        location = paddle.stack(
            [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2
        return location

    # decode bbox from ltrb to x1y1x2y2
    def decode_bbox(self, ltrb, points):
        assert ltrb.shape[0] == points.shape[0], \
            "When decoding bbox in one image, the num of loc should be same with points."
        bbox_decoding = paddle.stack(
            [
                points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
                points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]
            ],
            axis=1)
        return bbox_decoding

    # encode bbox from x1y1x2y2 to ltrb
    def encode_bbox(self, bbox, points):
        assert bbox.shape[0] == points.shape[0], \
            "When encoding bbox in one image, the num of bbox should be same with points."
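        # ltrb offsets are measured from each anchor point out to the four
        # box sides, mirroring decode_bbox above.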
bbox_encoding = paddle.stack( [ points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1], bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1] ], axis=1) return bbox_encoding def calcualate_iou(self, gt_bbox, predict_bbox): # bbox area gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \ (gt_bbox[:, 3] - gt_bbox[:, 1]) predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \ (predict_bbox[:, 3] - predict_bbox[:, 1]) # overlop area lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2]) rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:]) wh = paddle.clip(rb - lt, min=0) overlap = wh[..., 0] * wh[..., 1] # iou iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap) return iou # select potential positives from hard negatives def hard_neg_mining(self, cls_score, loc_ltrb, quality, pos_ind, hard_neg_ind, loc_mask, loc_targets, iou_thresh=0.6): # get points locations and strides points_list = [] strides_list = [] scale_list = [] scale = [0, 1, 2, 3, 4] for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride, self.lvl_hw): h, w = HW lvl_points = self.compute_locations_by_level(fpn_stride, h, w) points_list.append(lvl_points) lvl_strides = paddle.full([h * w, 1], fpn_stride) strides_list.append(lvl_strides) lvl_scales = paddle.full([h * w, 1], fpn_scale) scale_list.append(lvl_scales) points = paddle.concat(points_list, axis=0) strides = paddle.concat(strides_list, axis=0) scales = paddle.concat(scale_list, axis=0) # cls scores cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality) max_vals = paddle.max(cls_vals, axis=-1) class_ind = paddle.argmax(cls_vals, axis=-1) ### calculate iou between positive and hard negative # decode pos bbox pos_cls = max_vals[pos_ind] pos_loc = loc_ltrb[pos_ind].reshape([-1, 4]) pos_strides = strides[pos_ind] pos_points = points[pos_ind].reshape([-1, 2]) pos_loc = pos_loc * pos_strides pos_bbox = self.decode_bbox(pos_loc, pos_points) pos_scales = scales[pos_ind] # decode hard negative bbox hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4]) hard_neg_strides = strides[hard_neg_ind] hard_neg_points = points[hard_neg_ind].reshape([-1, 2]) hard_neg_loc = hard_neg_loc * hard_neg_strides hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points) hard_neg_scales = scales[hard_neg_ind] # iou between pos bbox and hard negative bbox hard_neg_pos_iou = self.calcualate_iou(hard_neg_bbox, pos_bbox) ### select potential positives from hard negatives # scale flag scale_temp = paddle.abs( pos_scales.reshape([-1])[None, :] - hard_neg_scales.reshape([-1]) [:, None]) scale_flag = (scale_temp <= 1.) # iou flag iou_flag = (hard_neg_pos_iou >= iou_thresh) # same class flag pos_class = class_ind[pos_ind] hard_neg_class = class_ind[hard_neg_ind] class_flag = pos_class[None, :] - hard_neg_class[:, None] class_flag = (class_flag == 0) # hard negative point inside positive bbox flag ltrb_temp = paddle.stack( [ hard_neg_points[:, None, 0] - pos_bbox[None, :, 0], hard_neg_points[:, None, 1] - pos_bbox[None, :, 1], pos_bbox[None, :, 2] - hard_neg_points[:, None, 0], pos_bbox[None, :, 3] - hard_neg_points[:, None, 1] ], axis=-1) inside_flag = ltrb_temp.min(axis=-1) > 0 # reset iou valid_flag = (iou_flag & class_flag & inside_flag & scale_flag) invalid_iou = paddle.zeros_like(hard_neg_pos_iou) hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou, invalid_iou) pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1) # selece potential pos potential_pos_ind = (pos_hard_neg_max_iou > 0.) 
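        # hard negatives keep a non-zero IoU only if they pass all four
        # checks above (adjacent FPN scale, IoU >= thresh, same class, point
        # inside the positive box); any with max IoU > 0 become potential
        # positives for the localization branch.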
num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0] if num_potential_pos == 0: return None ### calculate loc target:aggregate all matching bboxes as the bbox targets of potential pos # prepare data potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2]) potential_strides = hard_neg_strides[potential_pos_ind] potential_valid_flag = valid_flag[potential_pos_ind] potential_pos_ind = hard_neg_ind[potential_pos_ind] # get cls and box of matching positives pos_cls = max_vals[pos_ind] expand_pos_bbox = paddle.expand( pos_bbox, shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]]) expand_pos_cls = paddle.expand( pos_cls, shape=[num_potential_pos, pos_cls.shape[0]]) invalid_cls = paddle.zeros_like(expand_pos_cls) expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls, invalid_cls) expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1) # aggregate box based on cls_score agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \ / expand_pos_cls.sum(axis=1) agg_ltrb = self.encode_bbox(agg_bbox, potential_points) agg_ltrb = agg_ltrb / potential_strides # loc target for all pos loc_targets[potential_pos_ind] = agg_ltrb loc_mask[potential_pos_ind] = 1. return loc_mask, loc_targets # get training targets def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc, stu_iou): ### sample selection # prepare datas tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou) class_ind = paddle.argmax(tea_cls_scores, axis=-1) max_vals = paddle.max(tea_cls_scores, axis=-1) cls_mask = paddle.zeros_like( max_vals ) # set cls valid mask: pos is 1, hard_negative and negative are 0. num_pos, num_hard_neg = 0, 0 # mean-std selection # using nonzero to turn index from bool to int, because the index will be used to compose two-dim index in following. # using squeeze rather than reshape to avoid errors when no score is larger than thresh. candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1) num_candidate = candidate_ind.shape[0] if num_candidate > 0: # pos thresh = mean + std to select pos samples candidate_score = max_vals[candidate_ind] candidate_score_mean = candidate_score.mean() candidate_score_std = candidate_score.std() pos_thresh = (candidate_score_mean + candidate_score_std).clip( max=0.4) # select pos pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1) num_pos = pos_ind.shape[0] # select hard negatives as potential pos hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh) hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1) num_hard_neg = hard_neg_ind.shape[0] # if not positive, directly select top-10 as pos. if (num_pos == 0): num_pos = 10 _, pos_ind = paddle.topk(max_vals, k=num_pos) cls_mask[pos_ind] = 1. 
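        # Target construction: positives regress the teacher's joint
        # cls*iou score for their class, hard negatives are supervised with
        # the teacher's full soft score vector (classification only), and
        # box/iou targets are taken from the teacher at positive points.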
### Consistency Regularization Training targets # cls targets pos_class_ind = class_ind[pos_ind] cls_targets = paddle.zeros_like(tea_cls) cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind, pos_class_ind] # hard negative cls target if num_hard_neg != 0: cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind] # loc targets loc_targets = paddle.zeros_like(tea_loc) loc_targets[pos_ind] = tea_loc[pos_ind] # iou targets iou_targets = paddle.zeros( shape=[tea_iou.shape[0]], dtype=tea_iou.dtype) iou_targets[pos_ind] = F.sigmoid( paddle.squeeze( tea_iou, axis=-1)[pos_ind]) loc_mask = cls_mask.clone() # select potential positive from hard negatives for loc_task training if (num_hard_neg > 0) and self.hard_neg_mining_flag: results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind, hard_neg_ind, loc_mask, loc_targets) if results is not None: loc_mask, loc_targets = results loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1) iou_targets[loc_pos_ind] = F.sigmoid( paddle.squeeze( tea_iou, axis=-1)[loc_pos_ind]) return cls_mask, loc_mask, \ cls_targets, loc_targets, iou_targets def forward(self, student_prediction, teacher_prediction): stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction # H and W of level (used for aggregating targets) self.lvl_hw = [] for t in tea_cls_lvl: _, _, H, W = t.shape self.lvl_hw.append([H, W]) # levels to images stu_cls_img = levels_to_images(stu_cls_lvl) stu_loc_img = levels_to_images(stu_loc_lvl) stu_iou_img = levels_to_images(stu_iou_lvl) tea_cls_img = levels_to_images(tea_cls_lvl) tea_loc_img = levels_to_images(tea_loc_lvl) tea_iou_img = levels_to_images(tea_iou_lvl) with paddle.no_grad(): cls_mask, loc_mask, \ cls_targets, loc_targets, iou_targets = multi_apply( self.get_targets_per_img, tea_cls_img, tea_loc_img, tea_iou_img, stu_cls_img, stu_loc_img, stu_iou_img ) # flatten preditction stu_cls = paddle.concat(stu_cls_img, axis=0) stu_loc = paddle.concat(stu_loc_img, axis=0) stu_iou = paddle.concat(stu_iou_img, axis=0) # flatten targets cls_mask = paddle.concat(cls_mask, axis=0) loc_mask = paddle.concat(loc_mask, axis=0) cls_targets = paddle.concat(cls_targets, axis=0) loc_targets = paddle.concat(loc_targets, axis=0) iou_targets = paddle.concat(iou_targets, axis=0) ### Training Weights and avg factor # find positives cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1) loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1) # cls weight cls_sample_weights = paddle.ones([cls_targets.shape[0]]) cls_avg_factor = paddle.max(cls_targets[cls_pos_ind], axis=-1).sum().item() # loc weight loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1) loc_avg_factor = loc_sample_weights.sum().item() # iou weight iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]]) iou_avg_factor = loc_pos_ind.shape[0] ### unsupervised loss # cls loss loss_cls = self.quality_focal_loss( stu_cls, cls_targets, quality=stu_iou, weights=cls_sample_weights, avg_factor=cls_avg_factor) * self.cls_weight # iou loss pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind] pos_iou_targets = iou_targets[loc_pos_ind] loss_iou = F.binary_cross_entropy( F.sigmoid(pos_stu_iou), pos_iou_targets, reduction='none') * iou_sample_weights loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight # box loss pos_stu_loc = stu_loc[loc_pos_ind] pos_loc_targets = loc_targets[loc_pos_ind] loss_box = self.iou_loss( pos_stu_loc, pos_loc_targets, weights=loc_sample_weights, 
avg_factor=loc_avg_factor) loss_box = loss_box * self.reg_weight loss_all = { "loss_cls": loss_cls, "loss_box": loss_box, "loss_iou": loss_iou, } return loss_all ================================================ FILE: ppdet/modeling/losses/focal_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F import paddle.nn as nn from ppdet.core.workspace import register __all__ = ['FocalLoss', 'Weighted_FocalLoss'] @register class FocalLoss(nn.Layer): """A wrapper around paddle.nn.functional.sigmoid_focal_loss. Args: use_sigmoid (bool): currently only support use_sigmoid=True alpha (float): parameter alpha in Focal Loss gamma (float): parameter gamma in Focal Loss loss_weight (float): final loss will be multiplied by this """ def __init__(self, use_sigmoid=True, alpha=0.25, gamma=2.0, loss_weight=1.0): super(FocalLoss, self).__init__() assert use_sigmoid == True, \ 'Focal Loss only supports sigmoid at the moment' self.use_sigmoid = use_sigmoid self.alpha = alpha self.gamma = gamma self.loss_weight = loss_weight def forward(self, pred, target, reduction='none'): """forward function. Args: pred (Tensor): logits of class prediction, of shape (N, num_classes) target (Tensor): target class label, of shape (N, ) reduction (str): the way to reduce loss, one of (none, sum, mean) """ num_classes = pred.shape[1] target = F.one_hot(target, num_classes+1).cast(pred.dtype) target = target[:, :-1].detach() loss = F.sigmoid_focal_loss( pred, target, alpha=self.alpha, gamma=self.gamma, reduction=reduction) return loss * self.loss_weight @register class Weighted_FocalLoss(FocalLoss): """A wrapper around paddle.nn.functional.sigmoid_focal_loss. Args: use_sigmoid (bool): currently only support use_sigmoid=True alpha (float): parameter alpha in Focal Loss gamma (float): parameter gamma in Focal Loss loss_weight (float): final loss will be multiplied by this """ def __init__(self, use_sigmoid=True, alpha=0.25, gamma=2.0, loss_weight=1.0, reduction="mean"): super(FocalLoss, self).__init__() assert use_sigmoid == True, \ 'Focal Loss only supports sigmoid at the moment' self.use_sigmoid = use_sigmoid self.alpha = alpha self.gamma = gamma self.loss_weight = loss_weight self.reduction = reduction def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """forward function. 
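        Applies sigmoid focal loss and then optional per-prediction
        weighting and avg_factor-based normalization.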
Args: pred (Tensor): logits of class prediction, of shape (N, num_classes) target (Tensor): target class label, of shape (N, ) reduction (str): the way to reduce loss, one of (none, sum, mean) """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) num_classes = pred.shape[1] target = F.one_hot(target, num_classes + 1).astype(pred.dtype) target = target[:, :-1].detach() loss = F.sigmoid_focal_loss( pred, target, alpha=self.alpha, gamma=self.gamma, reduction='none') if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/gfocal_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling import ops __all__ = ['QualityFocalLoss', 'DistributionFocalLoss'] def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True): """ Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: pred (Tensor): Predicted joint representation of classification and quality (IoU) estimation with shape (N, C), C is the number of classes. target (tuple([Tensor])): Target category label with shape (N,) and target quality label with shape (N,). beta (float): The beta parameter for calculating the modulating factor. Defaults to 2.0. Returns: Tensor: Loss tensor with shape (N,). 
""" assert len(target) == 2, """target for QFL must be a tuple of two elements, including category label and quality label, respectively""" # label denotes the category id, score denotes the quality score label, score = target if use_sigmoid: func = F.binary_cross_entropy_with_logits else: func = F.binary_cross_entropy # negatives are supervised by 0 quality score pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred scale_factor = pred_sigmoid zerolabel = paddle.zeros(pred.shape, dtype='float32') loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes bg_class_ind = pred.shape[1] pos = paddle.logical_and((label >= 0), (label < bg_class_ind)).nonzero().squeeze(1) if pos.shape[0] == 0: return loss.sum(axis=1) pos_label = paddle.gather(label, pos, axis=0) pos_mask = np.zeros(pred.shape, dtype=np.int32) pos_mask[pos.numpy(), pos_label.numpy()] = 1 pos_mask = paddle.to_tensor(pos_mask, dtype='bool') score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32') # positives are supervised by bbox quality (IoU) score scale_factor_new = score - pred_sigmoid loss_pos = func( pred, score, reduction='none') * scale_factor_new.abs().pow(beta) loss = loss * paddle.logical_not(pos_mask).astype(loss.dtype) + loss_pos * pos_mask.astype(loss.dtype) loss = loss.sum(axis=1) return loss def distribution_focal_loss(pred, label): """Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: pred (Tensor): Predicted general distribution of bounding boxes (before softmax) with shape (N, n+1), n is the max value of the integral set `{0, ..., n}` in paper. label (Tensor): Target distance label for bounding boxes with shape (N,). Returns: Tensor: Loss tensor with shape (N,). """ dis_left = label.cast('int64') dis_right = dis_left + 1 weight_left = dis_right.cast('float32') - label weight_right = label - dis_left.cast('float32') loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \ + F.cross_entropy(pred, dis_right, reduction='none') * weight_right return loss @register @serializable class QualityFocalLoss(nn.Layer): r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: use_sigmoid (bool): Whether sigmoid operation is conducted in QFL. Defaults to True. beta (float): The beta parameter for calculating the modulating factor. Defaults to 2.0. reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Loss weight of current loss. """ def __init__(self, use_sigmoid=True, beta=2.0, reduction='mean', loss_weight=1.0): super(QualityFocalLoss, self).__init__() self.use_sigmoid = use_sigmoid self.beta = beta assert reduction in ('none', 'mean', 'sum') self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None): """Forward function. Args: pred (Tensor): Predicted joint representation of classification and quality (IoU) estimation with shape (N, C), C is the number of classes. target (tuple([Tensor])): Target category label with shape (N,) and target quality label with shape (N,). weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. 
""" loss = self.loss_weight * quality_focal_loss( pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid) if weight is not None: loss = loss * weight if avg_factor is None: if self.reduction == 'none': return loss elif self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': return loss.sum() else: # if reduction is mean, then average the loss by avg_factor if self.reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif self.reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') return loss @register @serializable class DistributionFocalLoss(nn.Layer): """Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: reduction (str): Options are `'none'`, `'mean'` and `'sum'`. loss_weight (float): Loss weight of current loss. """ def __init__(self, reduction='mean', loss_weight=1.0): super(DistributionFocalLoss, self).__init__() assert reduction in ('none', 'mean', 'sum') self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None): """Forward function. Args: pred (Tensor): Predicted general distribution of bounding boxes (before softmax) with shape (N, n+1), n is the max value of the integral set `{0, ..., n}` in paper. target (Tensor): Target distance label for bounding boxes with shape (N,). weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ loss = self.loss_weight * distribution_focal_loss(pred, target) if weight is not None: loss = loss * weight if avg_factor is None: if self.reduction == 'none': return loss elif self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': return loss.sum() else: # if reduction is mean, then average the loss by avg_factor if self.reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif self.reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') return loss ================================================ FILE: ppdet/modeling/losses/iou_aware_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from .iou_loss import IouLoss from ..bbox_utils import bbox_iou @register @serializable class IouAwareLoss(IouLoss): """ iou aware loss, see https://arxiv.org/abs/1912.05992 Args: loss_weight (float): iou aware loss weight, default is 1.0 max_height (int): max height of input to support random shape input max_width (int): max width of input to support random shape input """ def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False): super(IouAwareLoss, self).__init__( loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou) def __call__(self, ioup, pbox, gbox): iou = bbox_iou( pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) iou.stop_gradient = True loss_iou_aware = F.binary_cross_entropy_with_logits( ioup, iou, reduction='none') loss_iou_aware = loss_iou_aware * self.loss_weight return loss_iou_aware ================================================ FILE: ppdet/modeling/losses/iou_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import math import paddle from ppdet.core.workspace import register, serializable from ..bbox_utils import bbox_iou __all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss'] @register @serializable class IouLoss(object): """ iou loss, see https://arxiv.org/abs/1908.03851 loss = 1.0 - iou * iou Args: loss_weight (float): iou loss weight, default is 2.5 max_height (int): max height of input to support random shape input max_width (int): max width of input to support random shape input ciou_term (bool): whether to add ciou_term loss_square (bool): whether to square the iou term """ def __init__(self, loss_weight=2.5, giou=False, diou=False, ciou=False, loss_square=True): self.loss_weight = loss_weight self.giou = giou self.diou = diou self.ciou = ciou self.loss_square = loss_square def __call__(self, pbox, gbox): iou = bbox_iou( pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) if self.loss_square: loss_iou = 1 - iou * iou else: loss_iou = 1 - iou loss_iou = loss_iou * self.loss_weight return loss_iou @register @serializable class GIoULoss(object): """ Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 Args: loss_weight (float): giou loss weight, default as 1 eps (float): epsilon to avoid divide by zero, default as 1e-10 reduction (string): Options are "none", "mean" and "sum". 
default as none """ def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): self.loss_weight = loss_weight self.eps = eps assert reduction in ('none', 'mean', 'sum') self.reduction = reduction def bbox_overlap(self, box1, box2, eps=1e-10): """calculate the iou of box1 and box2 Args: box1 (Tensor): box1 with the shape (..., 4) box2 (Tensor): box1 with the shape (..., 4) eps (float): epsilon to avoid divide by zero Return: iou (Tensor): iou of box1 and box2 overlap (Tensor): overlap of box1 and box2 union (Tensor): union of box1 and box2 """ x1, y1, x2, y2 = box1 x1g, y1g, x2g, y2g = box2 xkis1 = paddle.maximum(x1, x1g) ykis1 = paddle.maximum(y1, y1g) xkis2 = paddle.minimum(x2, x2g) ykis2 = paddle.minimum(y2, y2g) w_inter = (xkis2 - xkis1).clip(0) h_inter = (ykis2 - ykis1).clip(0) overlap = w_inter * h_inter area1 = (x2 - x1) * (y2 - y1) area2 = (x2g - x1g) * (y2g - y1g) union = area1 + area2 - overlap + eps iou = overlap / union return iou, overlap, union def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None): x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) box1 = [x1, y1, x2, y2] box2 = [x1g, y1g, x2g, y2g] iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) xc1 = paddle.minimum(x1, x1g) yc1 = paddle.minimum(y1, y1g) xc2 = paddle.maximum(x2, x2g) yc2 = paddle.maximum(y2, y2g) area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps miou = iou - ((area_c - union) / area_c) if loc_reweight is not None: loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) loc_thresh = 0.9 giou = 1 - (1 - loc_thresh ) * miou - loc_thresh * miou * loc_reweight else: giou = 1 - miou if self.reduction == 'none': loss = giou elif self.reduction == 'sum': loss = paddle.sum(giou * iou_weight) else: loss = paddle.mean(giou * iou_weight) return loss * self.loss_weight @register @serializable class DIouLoss(GIoULoss): """ Distance-IoU Loss, see https://arxiv.org/abs/1911.08287 Args: loss_weight (float): giou loss weight, default as 1 eps (float): epsilon to avoid divide by zero, default as 1e-10 use_complete_iou_loss (bool): whether to use complete iou loss """ def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True): super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps) self.use_complete_iou_loss = use_complete_iou_loss def __call__(self, pbox, gbox, iou_weight=1.): x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) cx = (x1 + x2) / 2 cy = (y1 + y2) / 2 w = x2 - x1 h = y2 - y1 cxg = (x1g + x2g) / 2 cyg = (y1g + y2g) / 2 wg = x2g - x1g hg = y2g - y1g x2 = paddle.maximum(x1, x2) y2 = paddle.maximum(y1, y2) # A and B xkis1 = paddle.maximum(x1, x1g) ykis1 = paddle.maximum(y1, y1g) xkis2 = paddle.minimum(x2, x2g) ykis2 = paddle.minimum(y2, y2g) # A or B xc1 = paddle.minimum(x1, x1g) yc1 = paddle.minimum(y1, y1g) xc2 = paddle.maximum(x2, x2g) yc2 = paddle.maximum(y2, y2g) intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) intsctk = intsctk * paddle.greater_than( xkis2, xkis1).astype(intsctk.dtype) * paddle.greater_than(ykis2, ykis1).astype(intsctk.dtype) unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g ) - intsctk + self.eps iouk = intsctk / unionk # DIOU term dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) diou_term = (dist_intersection + self.eps) / (dist_union + self.eps) # CIOU term ciou_term = 0 if 
self.use_complete_iou_loss: ar_gt = wg / hg ar_pred = w / h arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred) ar_loss = 4. / np.pi / np.pi * arctan * arctan alpha = ar_loss / (1 - iouk + ar_loss + self.eps) alpha.stop_gradient = True ciou_term = alpha * ar_loss diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) return diou * self.loss_weight @register @serializable class SIoULoss(GIoULoss): """ see https://arxiv.org/pdf/2205.12740.pdf Args: loss_weight (float): siou loss weight, default as 1 eps (float): epsilon to avoid divide by zero, default as 1e-10 theta (float): default as 4 reduction (str): Options are "none", "mean" and "sum". default as none """ def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'): super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps) self.loss_weight = loss_weight self.eps = eps self.theta = theta self.reduction = reduction def __call__(self, pbox, gbox): x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) box1 = [x1, y1, x2, y2] box2 = [x1g, y1g, x2g, y2g] iou = bbox_iou(box1, box2) cx = (x1 + x2) / 2 cy = (y1 + y2) / 2 w = x2 - x1 + self.eps h = y2 - y1 + self.eps cxg = (x1g + x2g) / 2 cyg = (y1g + y2g) / 2 wg = x2g - x1g + self.eps hg = y2g - y1g + self.eps x2 = paddle.maximum(x1, x2) y2 = paddle.maximum(y1, y2) # A or B xc1 = paddle.minimum(x1, x1g) yc1 = paddle.minimum(y1, y1g) xc2 = paddle.maximum(x2, x2g) yc2 = paddle.maximum(y2, y2g) cw_out = xc2 - xc1 ch_out = yc2 - yc1 ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg) cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg) # angle cost dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2) sin_angle_alpha = ch / dist_intersection sin_angle_beta = cw / dist_intersection thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2 thred.stop_gradient = True sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta, sin_angle_alpha) angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2) # distance cost gamma = 2 - angle_cost # gamma.stop_gradient = True beta_x = ((cxg - cx) / cw_out)**2 beta_y = ((cyg - cy) / ch_out)**2 dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma * beta_y) # shape cost omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg) omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg) omega = (1 - paddle.exp(-omega_w))**self.theta + ( 1 - paddle.exp(-omega_h))**self.theta siou_loss = 1 - iou + (omega + dist_cost) / 2 if self.reduction == 'mean': siou_loss = paddle.mean(siou_loss) elif self.reduction == 'sum': siou_loss = paddle.sum(siou_loss) return siou_loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/jde_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register __all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss'] @register class JDEDetectionLoss(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=1, for_mot=True): super(JDEDetectionLoss, self).__init__() self.num_classes = num_classes self.for_mot = for_mot def det_loss(self, p_det, anchor, t_conf, t_box): pshape = paddle.shape(p_det) pshape.stop_gradient = True nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1] nA = len(anchor) p_det = paddle.reshape( p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose( (0, 1, 3, 4, 2)) # 1. loss_conf: cross_entropy p_conf = p_det[:, :, :, :, 4:6] p_conf_flatten = paddle.reshape(p_conf, [-1, 2]) t_conf_flatten = t_conf.flatten() t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64") t_conf_flatten.stop_gradient = True loss_conf = F.cross_entropy( p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean') loss_conf.stop_gradient = False # 2. loss_box: smooth_l1_loss p_box = p_det[:, :, :, :, :4] p_box_flatten = paddle.reshape(p_box, [-1, 4]) t_box_flatten = paddle.reshape(t_box, [-1, 4]) fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten() if fg_inds.numel() > 0: reg_delta = paddle.gather(p_box_flatten, fg_inds) reg_target = paddle.gather(t_box_flatten, fg_inds) else: reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32') reg_delta.stop_gradient = False reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32') reg_target.stop_gradient = True loss_box = F.smooth_l1_loss( reg_delta, reg_target, reduction='mean', delta=1.0) loss_box.stop_gradient = False return loss_conf, loss_box def forward(self, det_outs, targets, anchors): """ Args: det_outs (list[Tensor]): output from detection head, each one is a 4-D Tensor with shape [N, C, H, W]. targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image', 'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of each FPN level. anchors (list[list]): anchor setting of JDE model, N row M col, N is the anchor levels(FPN levels), M is the anchor scales each level. 
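        Returns:
            dict: lists of per-level 'loss_confs' and 'loss_boxes' when
                `for_mot` is True, otherwise the summed 'loss_conf',
                'loss_box' and total 'loss'.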
""" assert len(det_outs) == len(anchors) loss_confs = [] loss_boxes = [] for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)): t_conf = targets['tconf{}'.format(i)] t_box = targets['tbox{}'.format(i)] loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box) loss_confs.append(loss_conf) loss_boxes.append(loss_box) if self.for_mot: return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes} else: jde_conf_losses = sum(loss_confs) jde_box_losses = sum(loss_boxes) jde_det_losses = { "loss_conf": jde_conf_losses, "loss_box": jde_box_losses, "loss": jde_conf_losses + jde_box_losses, } return jde_det_losses @register class JDEEmbeddingLoss(nn.Layer): def __init__(self, ): super(JDEEmbeddingLoss, self).__init__() self.phony = self.create_parameter(shape=[1], dtype="float32") def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier): emb_dim = p_ide.shape[1] p_ide = p_ide.transpose((0, 2, 3, 1)) p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim]) mask = t_conf > 0 mask = paddle.cast(mask, dtype="int64") mask.stop_gradient = True emb_mask = mask.max(1).flatten() emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() emb_mask_inds.stop_gradient = True # use max(1) to decide the id, TODO: more reseanable strategy t_ide_flatten = t_ide.max(1).flatten() t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64") valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten() if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0: # loss_ide = paddle.to_tensor([0]) # will be error in gradient backward loss_ide = self.phony * 0 # todo else: embedding = paddle.gather(p_ide_flatten, emb_mask_inds) embedding = emb_scale * F.normalize(embedding) logits = classifier(embedding) ide_target = paddle.gather(t_ide_flatten, emb_mask_inds) loss_ide = F.cross_entropy( logits, ide_target, ignore_index=-1, reduction='mean') loss_ide.stop_gradient = False return loss_ide def forward(self, ide_outs, targets, emb_scale, classifier): loss_ides = [] for i, p_ide in enumerate(ide_outs): t_conf = targets['tconf{}'.format(i)] t_ide = targets['tide{}'.format(i)] loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale, classifier) loss_ides.append(loss_ide) return loss_ides @register class JDELoss(nn.Layer): def __init__(self): super(JDELoss, self).__init__() def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls, loss_params_reg, loss_params_ide, targets): assert len(loss_confs) == len(loss_boxes) == len(loss_ides) assert len(loss_params_cls) == len(loss_params_reg) == len( loss_params_ide) assert len(loss_confs) == len(loss_params_cls) batchsize = targets['gt_bbox'].shape[0] nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[ 0] / batchsize nTargets = paddle.to_tensor(nTargets, dtype='float32') nTargets.stop_gradient = True jde_losses = [] for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p, l_ide_p) in enumerate( zip(loss_confs, loss_boxes, loss_ides, loss_params_cls, loss_params_reg, loss_params_ide)): jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p( loss_ide) jde_losses.append(jde_loss) loss_all = { "loss_conf": sum(loss_confs), "loss_box": sum(loss_boxes), "loss_ide": sum(loss_ides), "loss": sum(jde_losses), "nTargets": nTargets, } return loss_all ================================================ FILE: ppdet/modeling/losses/keypoint_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from itertools import cycle, islice from collections import abc import numpy as np import paddle import paddle.nn as nn from ppdet.core.workspace import register, serializable __all__ = ['HrHRNetLoss', 'KeyPointMSELoss', 'OKSLoss', 'CenterFocalLoss', 'L1Loss'] @register @serializable class KeyPointMSELoss(nn.Layer): def __init__(self, use_target_weight=True, loss_scale=0.5): """ KeyPointMSELoss layer Args: use_target_weight (bool): whether to use target weight """ super(KeyPointMSELoss, self).__init__() self.criterion = nn.MSELoss(reduction='mean') self.use_target_weight = use_target_weight self.loss_scale = loss_scale def forward(self, output, records): target = records['target'] target_weight = records['target_weight'] batch_size = output.shape[0] num_joints = output.shape[1] heatmaps_pred = output.reshape( (batch_size, num_joints, -1)).split(num_joints, 1) heatmaps_gt = target.reshape( (batch_size, num_joints, -1)).split(num_joints, 1) loss = 0 for idx in range(num_joints): heatmap_pred = heatmaps_pred[idx].squeeze() heatmap_gt = heatmaps_gt[idx].squeeze() if self.use_target_weight: loss += self.loss_scale * self.criterion( heatmap_pred.multiply(target_weight[:, idx]), heatmap_gt.multiply(target_weight[:, idx])) else: loss += self.loss_scale * self.criterion(heatmap_pred, heatmap_gt) keypoint_losses = dict() keypoint_losses['loss'] = loss / num_joints return keypoint_losses @register @serializable class HrHRNetLoss(nn.Layer): def __init__(self, num_joints, swahr): """ HrHRNetLoss layer Args: num_joints (int): number of keypoints """ super(HrHRNetLoss, self).__init__() if swahr: self.heatmaploss = HeatMapSWAHRLoss(num_joints) else: self.heatmaploss = HeatMapLoss() self.aeloss = AELoss() self.ziploss = ZipLoss( [self.heatmaploss, self.heatmaploss, self.aeloss]) def forward(self, inputs, records): targets = [] targets.append([records['heatmap_gt1x'], records['mask_1x']]) targets.append([records['heatmap_gt2x'], records['mask_2x']]) targets.append(records['tagmap']) keypoint_losses = dict() loss = self.ziploss(inputs, targets) keypoint_losses['heatmap_loss'] = loss[0] + loss[1] keypoint_losses['pull_loss'] = loss[2][0] keypoint_losses['push_loss'] = loss[2][1] keypoint_losses['loss'] = recursive_sum(loss) return keypoint_losses class HeatMapLoss(object): def __init__(self, loss_factor=1.0): super(HeatMapLoss, self).__init__() self.loss_factor = loss_factor def __call__(self, preds, targets): heatmap, mask = targets loss = ((preds - heatmap)**2 * mask.cast('float').unsqueeze(1)) loss = paddle.clip(loss, min=0, max=2).mean() loss *= self.loss_factor return loss class HeatMapSWAHRLoss(object): def __init__(self, num_joints, loss_factor=1.0): super(HeatMapSWAHRLoss, self).__init__() self.loss_factor = loss_factor self.num_joints = num_joints def __call__(self, preds, targets): heatmaps_gt, mask = targets heatmaps_pred = preds[0] scalemaps_pred = preds[1] 
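        # SWAHR: rescale the ground-truth heatmaps with the predicted scale
        # maps before the squared-error term, and regularize the scale maps
        # toward 1 on positive regions so the rescaling stays near identity.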
heatmaps_scaled_gt = paddle.where(heatmaps_gt > 0, 0.5 * heatmaps_gt * ( 1 + (1 + (scalemaps_pred - 1.) * paddle.log(heatmaps_gt + 1e-10))**2), heatmaps_gt) regularizer_loss = paddle.mean( paddle.pow((scalemaps_pred - 1.) * (heatmaps_gt > 0).astype(float), 2)) omiga = 0.01 # thres = 2**(-1/omiga), threshold for positive weight hm_weight = heatmaps_scaled_gt**( omiga ) * paddle.abs(1 - heatmaps_pred) + paddle.abs(heatmaps_pred) * ( 1 - heatmaps_scaled_gt**(omiga)) loss = (((heatmaps_pred - heatmaps_scaled_gt)**2) * mask.cast('float').unsqueeze(1)) * hm_weight loss = loss.mean() loss = self.loss_factor * (loss + 1.0 * regularizer_loss) return loss class AELoss(object): def __init__(self, pull_factor=0.001, push_factor=0.001): super(AELoss, self).__init__() self.pull_factor = pull_factor self.push_factor = push_factor def apply_single(self, pred, tagmap): if tagmap.numpy()[:, :, 3].sum() == 0: return (paddle.zeros([1]), paddle.zeros([1])) nonzero = paddle.nonzero(tagmap[:, :, 3] > 0) if nonzero.shape[0] == 0: return (paddle.zeros([1]), paddle.zeros([1])) p_inds = paddle.unique(nonzero[:, 0]) num_person = p_inds.shape[0] if num_person == 0: return (paddle.zeros([1]), paddle.zeros([1])) pull = 0 tagpull_num = 0 embs_all = [] person_unvalid = 0 for person_idx in p_inds.numpy(): valid_single = tagmap[person_idx.item()] validkpts = paddle.nonzero(valid_single[:, 3] > 0) valid_single = paddle.index_select(valid_single, validkpts) emb = paddle.gather_nd(pred, valid_single[:, :3]) if emb.shape[0] == 1: person_unvalid += 1 mean = paddle.mean(emb, axis=0) embs_all.append(mean) pull += paddle.mean(paddle.pow(emb - mean, 2), axis=0) tagpull_num += emb.shape[0] pull /= max(num_person - person_unvalid, 1) if num_person < 2: return pull, paddle.zeros([1]) embs_all = paddle.stack(embs_all) A = embs_all.expand([num_person, num_person]) B = A.transpose([1, 0]) diff = A - B diff = paddle.pow(diff, 2) push = paddle.exp(-diff) push = paddle.sum(push) - num_person push /= 2 * num_person * (num_person - 1) return pull, push def __call__(self, preds, tagmaps): bs = preds.shape[0] losses = [ self.apply_single(preds[i:i + 1].squeeze(), tagmaps[i:i + 1].squeeze()) for i in range(bs) ] pull = self.pull_factor * sum(loss[0] for loss in losses) / len(losses) push = self.push_factor * sum(loss[1] for loss in losses) / len(losses) return pull, push class ZipLoss(object): def __init__(self, loss_funcs): super(ZipLoss, self).__init__() self.loss_funcs = loss_funcs def __call__(self, inputs, targets): assert len(self.loss_funcs) == len(targets) >= len(inputs) def zip_repeat(*args): longest = max(map(len, args)) filled = [islice(cycle(x), longest) for x in args] return zip(*filled) return tuple( fn(x, y) for x, y, fn in zip_repeat(inputs, targets, self.loss_funcs)) def recursive_sum(inputs): if isinstance(inputs, abc.Sequence): return sum([recursive_sum(x) for x in inputs]) return inputs def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas): if not kpt_gts.astype('bool').any(): return kpt_preds.sum()*0 sigmas = paddle.to_tensor(sigmas, dtype=kpt_preds.dtype) variances = (sigmas * 2)**2 assert kpt_preds.shape[0] == kpt_gts.shape[0] kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1] // 2, 2)) kpt_gts = kpt_gts.reshape((-1, kpt_gts.shape[-1] // 2, 2)) squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \ (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2 assert (kpt_valids.sum(-1) > 0).all() squared_distance0 = squared_distance / ( kpt_areas[:, None] * variances[None, :] * 2) squared_distance1 = 
paddle.exp(-squared_distance0) squared_distance1 = squared_distance1 * kpt_valids oks = squared_distance1.sum(axis=1) / kpt_valids.sum(axis=1) return oks def oks_loss(pred, target, weight, valid=None, area=None, linear=False, sigmas=None, eps=1e-6, avg_factor=None, reduction=None): """Oks loss. Computing the oks loss between a set of predicted poses and target poses. The loss is calculated as negative log of oks. Args: pred (Tensor): Predicted poses of format (x1, y1, x2, y2, ...), shape (n, K*2). target (Tensor): Corresponding gt poses, shape (n, K*2). linear (bool, optional): If True, use linear scale of loss instead of log scale. Default: False. eps (float): Eps to avoid log(0). Returns: Tensor: Loss tensor. """ oks = oks_overlaps(pred, target, valid, area, sigmas).clip(min=eps) if linear: loss = 1 - oks else: loss = -oks.log() if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss @register @serializable class OKSLoss(nn.Layer): """OKSLoss. Computing the oks loss between a set of predicted poses and target poses. Args: linear (bool): If True, use linear scale of loss instead of log scale. Default: False. eps (float): Eps to avoid log(0). reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Weight of loss. """ def __init__(self, linear=False, num_keypoints=17, eps=1e-6, reduction='mean', loss_weight=1.0): super(OKSLoss, self).__init__() self.linear = linear self.eps = eps self.reduction = reduction self.loss_weight = loss_weight if num_keypoints == 17: self.sigmas = np.array([ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ], dtype=np.float32) / 10.0 elif num_keypoints == 14: self.sigmas = np.array([ .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, .79 ]) / 10.0 else: raise ValueError(f'Unsupported keypoints number {num_keypoints}') def forward(self, pred, target, valid, area, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. valid (Tensor): The visible flag of the target pose. area (Tensor): The area of the target pose. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. 
reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. Options are "none", "mean" and "sum". """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if (weight is not None) and (not paddle.any(weight > 0)) and ( reduction != 'none'): if pred.dim() == weight.dim() + 1: weight = weight.unsqueeze(1) return (pred * weight).sum() # 0 if weight is not None and weight.dim() > 1: # TODO: remove this in the future # reduce the weight of shape (n, 4) to (n,) to match the # iou_loss of shape (n,) assert weight.shape == pred.shape weight = weight.mean(-1) loss = self.loss_weight * oks_loss( pred, target, weight, valid=valid, area=area, linear=self.linear, sigmas=self.sigmas, eps=self.eps, reduction=reduction, avg_factor=avg_factor, **kwargs) return loss def center_focal_loss(pred, gt, weight=None, mask=None, avg_factor=None, reduction=None): """Modified focal loss. Exactly the same as CornerNet. Runs faster and costs a little bit more memory. Args: pred (Tensor): The prediction with shape [bs, c, h, w]. gt (Tensor): The learning target of the prediction in gaussian distribution, with shape [bs, c, h, w]. mask (Tensor): The valid mask. Defaults to None. """ if not gt.astype('bool').any(): return pred.sum()*0 pos_inds = gt.equal(1).astype('float32') if mask is None: neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') else: neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') * mask.equal(0).astype('float32') neg_weights = paddle.pow(1 - gt, 4) loss = 0 pos_loss = paddle.log(pred) * paddle.pow(1 - pred, 2) * pos_inds neg_loss = paddle.log(1 - pred) * paddle.pow(pred, 2) * neg_weights * \ neg_inds num_pos = pos_inds.astype('float32').sum() pos_loss = pos_loss.sum() neg_loss = neg_loss.sum() if num_pos == 0: loss = loss - neg_loss else: loss = loss - (pos_loss + neg_loss) / num_pos if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss @register @serializable class CenterFocalLoss(nn.Layer): """CenterFocalLoss is a variant of focal loss. More details can be found in the `paper `_ Args: reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Loss weight of current loss. 
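Note: following CornerNet, locations with gt == 1 contribute -log(pred) * (1 - pred)**2, while every other location acts as a negative whose term is down-weighted by (1 - gt)**4 (see center_focal_loss above).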
""" def __init__(self, reduction='none', loss_weight=1.0): super(CenterFocalLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, mask=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction in gaussian distribution. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. mask (Tensor): The valid mask. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_reg = self.loss_weight * center_focal_loss( pred, target, weight, mask=mask, reduction=reduction, avg_factor=avg_factor) return loss_reg def l1_loss(pred, target, weight=None, reduction='mean', avg_factor=None): """L1 loss. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. Returns: Tensor: Calculated loss """ if not target.astype('bool').any(): return pred.sum() * 0 assert pred.shape == target.shape loss = paddle.abs(pred - target) if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss @register @serializable class L1Loss(nn.Layer): """L1 loss. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. """ def __init__(self, reduction='mean', loss_weight=1.0): super(L1Loss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. 
""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_bbox = self.loss_weight * l1_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss_bbox ================================================ FILE: ppdet/modeling/losses/pose3d_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from itertools import cycle, islice from collections import abc import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') __all__ = ['Pose3DLoss'] @register @serializable class Pose3DLoss(nn.Layer): def __init__(self, weight_3d=1.0, weight_2d=0.0, reduction='none'): """ KeyPointMSELoss layer Args: weight_3d (float): weight of 3d loss weight_2d (float): weight of 2d loss reduction (bool): whether use reduction to loss """ super(Pose3DLoss, self).__init__() self.weight_3d = weight_3d self.weight_2d = weight_2d self.criterion_2dpose = nn.MSELoss(reduction=reduction) self.criterion_3dpose = nn.L1Loss(reduction=reduction) self.criterion_smoothl1 = nn.SmoothL1Loss( reduction=reduction, delta=1.0) self.criterion_vertices = nn.L1Loss() def forward(self, pred3d, pred2d, inputs): """ mpjpe: mpjpe loss between 3d joints keypoint_2d_loss: 2d joints loss compute by criterion_2dpose """ gt_3d_joints = inputs['joints_3d'] gt_2d_joints = inputs['joints_2d'] has_3d_joints = inputs['has_3d_joints'] has_2d_joints = inputs['has_2d_joints'] loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints) loss = self.weight_3d * loss_3d epoch = inputs['epoch_id'] if self.weight_2d > 0: weight = self.weight_2d * pow(0.1, (epoch // 8)) if epoch > 8: weight = 0 loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d, gt_2d_joints, has_2d_joints) loss += weight * loss_2d return loss def filter_3d_joints(pred, gt, has_3d_joints): """ filter 3d joints """ gt = gt[has_3d_joints == 1] gt = gt[:, :, :3] pred = pred[has_3d_joints == 1] gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2 gt = gt - gt_pelvis[:, None, :] pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2 pred = pred - pred_pelvis[:, None, :] return pred, gt def mpjpe(pred, gt, has_3d_joints): """ mPJPE loss """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2 ).sum(axis=-1)).mean() return error def mpjpe_focal(pred, gt, has_3d_joints): """ mPJPE loss """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) mse_error = ((pred - gt)**2).sum(axis=-1) mpjpe_error = paddle.sqrt(mse_error) mean = mpjpe_error.mean() std = mpjpe_error.std() atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std) mse_error *= atte return mse_error.mean() def mpjpe_mse(pred, gt, 
has_3d_joints, weight=1.): """ mPJPE loss """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = (((pred - gt)**2).sum(axis=-1)).mean() return error def mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d): """ mPJPE loss with a self-defined criterion """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = paddle.sqrt(criterion_pose3d(pred, gt)).mean() return error @register @serializable def weighted_mpjpe(pred, gt, has_3d_joints): """ Weighted_mPJPE """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) weight = paddle.linalg.norm(pred, p=2, axis=-1) weight = paddle.to_tensor( [1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1., 1.]) error = (weight * paddle.linalg.norm(pred - gt, p=2, axis=-1)).mean() return error @register @serializable def normed_mpjpe(pred, gt, has_3d_joints): """ Normalized MPJPE (scale only), adapted from: https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py """ assert pred.shape == gt.shape pred, gt = filter_3d_joints(pred, gt, has_3d_joints) norm_predicted = paddle.mean( paddle.sum(pred**2, axis=-1, keepdim=True), axis=1, keepdim=True) norm_target = paddle.mean( paddle.sum(gt * pred, axis=-1, keepdim=True), axis=1, keepdim=True) scale = norm_target / norm_predicted # pred/gt are already filtered and pelvis-centered here, so an all-ones flag keeps every sample return mpjpe(scale * pred, gt, paddle.ones([pred.shape[0]])) @register @serializable def mpjpe_np(pred, gt, has_3d_joints): """ mPJPE_NP """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = np.sqrt(((pred - gt)**2).sum(axis=-1)).mean() return error @register @serializable def mean_per_vertex_error(pred, gt, has_smpl): """ Compute mPVE """ pred = pred[has_smpl == 1] gt = gt[has_smpl == 1] with paddle.no_grad(): error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean() return error @register @serializable def keypoint_2d_loss(criterion_keypoints, pred_keypoints_2d, gt_keypoints_2d, has_pose_2d): """ Compute 2D reprojection loss if 2D keypoint annotations are available. The confidence (conf) is binary and indicates whether the keypoints exist or not. """ conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone() loss = (conf * criterion_keypoints( pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean() return loss @register @serializable def keypoint_3d_loss(criterion_keypoints, pred_keypoints_3d, gt_keypoints_3d, has_pose_3d): """ Compute 3D keypoint loss if 3D keypoint annotations are available. """ conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone() gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone() gt_keypoints_3d = gt_keypoints_3d[has_pose_3d == 1] conf = conf[has_pose_3d == 1] pred_keypoints_3d = pred_keypoints_3d[has_pose_3d == 1] if len(gt_keypoints_3d) > 0: gt_pelvis = (gt_keypoints_3d[:, 2, :] + gt_keypoints_3d[:, 3, :]) / 2 gt_keypoints_3d = gt_keypoints_3d - gt_pelvis[:, None, :] pred_pelvis = ( pred_keypoints_3d[:, 2, :] + pred_keypoints_3d[:, 3, :]) / 2 pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :] return (conf * criterion_keypoints(pred_keypoints_3d, gt_keypoints_3d)).mean() else: return paddle.to_tensor([1.]).fill_(0.) @register @serializable def vertices_loss(criterion_vertices, pred_vertices, gt_vertices, has_smpl): """ Compute per-vertex loss if vertex annotations are available. """ pred_vertices_with_shape = pred_vertices[has_smpl == 1] gt_vertices_with_shape = gt_vertices[has_smpl == 1] if len(gt_vertices_with_shape) > 0: return criterion_vertices(pred_vertices_with_shape, gt_vertices_with_shape) else: return paddle.to_tensor([1.]).fill_(0.)
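# --- Illustrative usage sketch (not part of the original file). Shapes are
# assumptions inferred from filter_3d_joints: predictions are [N, K, 3]
# joints, ground truth is [N, K, 4] with a confidence flag in the last
# channel, and has_3d_joints marks samples that carry 3D annotations. ---
if __name__ == '__main__':
    paddle.seed(0)
    pred3d = paddle.rand([2, 14, 3])  # predicted 3d joints
    gt3d = paddle.rand([2, 14, 4])  # gt joints, last channel is confidence
    has_3d = paddle.to_tensor([1, 1])  # both samples carry 3d labels
    print('mpjpe:', float(mpjpe(pred3d, gt3d, has_3d)))
    print('mpjpe_focal:', float(mpjpe_focal(pred3d, gt3d, has_3d)))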
@register @serializable def rectify_pose(pose): pose = pose.copy() R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0] R_root = cv2.Rodrigues(pose[:3])[0] new_root = R_root.dot(R_mod) pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3) return pose ================================================ FILE: ppdet/modeling/losses/probiou_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn.functional as F from ppdet.core.workspace import register, serializable __all__ = ['ProbIoULoss'] def gbb_form(boxes): xy, wh, angle = paddle.split(boxes, [2, 2, 1], axis=-1) return paddle.concat([xy, wh.pow(2) / 12., angle], axis=-1) def rotated_form(a_, b_, angles): cos_a = paddle.cos(angles) sin_a = paddle.sin(angles) a = a_ * paddle.pow(cos_a, 2) + b_ * paddle.pow(sin_a, 2) b = a_ * paddle.pow(sin_a, 2) + b_ * paddle.pow(cos_a, 2) c = (a_ - b_) * cos_a * sin_a return a, b, c def probiou_loss(pred, target, eps=1e-3, mode='l1'): """ pred -> a matrix [N,5] (x, y, w, h, angle - in radians) containing our predicted boxes; in case of HBB, angle == 0 target -> a matrix [N,5] (x, y, w, h, angle - in radians) containing our target boxes; in case of HBB, angle == 0 eps -> threshold to avoid infinite values mode -> ('l1' in [0,1] or 'l2' in [0,inf]) metrics according to our paper """ gbboxes1 = gbb_form(pred) gbboxes2 = gbb_form(target) x1, y1, a1_, b1_, c1_ = gbboxes1[:, 0], gbboxes1[:, 1], gbboxes1[:, 2], gbboxes1[:, 3], gbboxes1[:, 4] x2, y2, a2_, b2_, c2_ = gbboxes2[:, 0], gbboxes2[:, 1], gbboxes2[:, 2], gbboxes2[:, 3], gbboxes2[:, 4] a1, b1, c1 = rotated_form(a1_, b1_, c1_) a2, b2, c2 = rotated_form(a2_, b2_, c2_) t1 = 0.25 * ((a1 + a2) * (paddle.pow(y1 - y2, 2)) + (b1 + b2) * (paddle.pow(x1 - x2, 2))) + \ 0.5 * ((c1+c2)*(x2-x1)*(y1-y2)) t2 = (a1 + a2) * (b1 + b2) - paddle.pow(c1 + c2, 2) t3_ = (a1 * b1 - c1 * c1) * (a2 * b2 - c2 * c2) t3 = 0.5 * paddle.log(t2 / (4 * paddle.sqrt(F.relu(t3_)) + eps)) B_d = (t1 / t2) + t3 # B_d = t1 + t2 + t3 B_d = paddle.clip(B_d, min=eps, max=100.0) l1 = paddle.sqrt(1.0 - paddle.exp(-B_d) + eps) l_i = paddle.pow(l1, 2.0) l2 = -paddle.log(1.0 - l_i + eps) if mode == 'l1': probiou = l1 if mode == 'l2': probiou = l2 return probiou @serializable @register class ProbIoULoss(object): """ ProbIoU Loss, refer to https://arxiv.org/abs/2106.06072 for details """ def __init__(self, mode='l1', eps=1e-3): super(ProbIoULoss, self).__init__() self.mode = mode self.eps = eps def __call__(self, pred_rboxes, assigned_rboxes): return probiou_loss(pred_rboxes, assigned_rboxes, self.eps, self.mode) ================================================ FILE: ppdet/modeling/losses/queryinst_loss.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.losses.iou_loss import GIoULoss from .sparsercnn_loss import HungarianMatcher __all__ = ['QueryInstLoss'] @register class QueryInstLoss(object): __shared__ = ['num_classes'] def __init__(self, num_classes=80, focal_loss_alpha=0.25, focal_loss_gamma=2.0, class_weight=2.0, l1_weight=5.0, giou_weight=2.0, mask_weight=8.0): super(QueryInstLoss, self).__init__() self.num_classes = num_classes self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma self.loss_weights = { "loss_cls": class_weight, "loss_bbox": l1_weight, "loss_giou": giou_weight, "loss_mask": mask_weight } self.giou_loss = GIoULoss(eps=1e-6, reduction='sum') self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, class_weight, l1_weight, giou_weight) def loss_classes(self, class_logits, targets, indices, avg_factor): tgt_labels = paddle.full( class_logits.shape[:2], self.num_classes, dtype='int32') if sum(len(v['labels']) for v in targets) > 0: tgt_classes = paddle.concat([ paddle.gather( tgt['labels'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ]) batch_idx, src_idx = self._get_src_permutation_idx(indices) for i, (batch_i, src_i) in enumerate(zip(batch_idx, src_idx)): tgt_labels[int(batch_i), int(src_i)] = tgt_classes[i] tgt_labels = tgt_labels.flatten(0, 1).unsqueeze(-1) tgt_labels_onehot = paddle.cast( tgt_labels == paddle.arange(0, self.num_classes), dtype='float32') tgt_labels_onehot.stop_gradient = True src_logits = class_logits.flatten(0, 1) loss_cls = F.sigmoid_focal_loss( src_logits, tgt_labels_onehot, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction='sum') / avg_factor losses = {'loss_cls': loss_cls * self.loss_weights['loss_cls']} return losses def loss_bboxes(self, bbox_pred, targets, indices, avg_factor): bboxes = paddle.concat([ paddle.gather( src, src_idx, axis=0) for src, (src_idx, _) in zip(bbox_pred, indices) ]) tgt_bboxes = paddle.concat([ paddle.gather( tgt['boxes'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ]) tgt_bboxes.stop_gradient = True im_shapes = paddle.concat([tgt['img_whwh_tgt'] for tgt in targets]) bboxes_norm = bboxes / im_shapes tgt_bboxes_norm = tgt_bboxes / im_shapes loss_giou = self.giou_loss(bboxes, tgt_bboxes) / avg_factor loss_bbox = F.l1_loss( bboxes_norm, tgt_bboxes_norm, reduction='sum') / avg_factor losses = { 'loss_bbox': loss_bbox * self.loss_weights['loss_bbox'], 'loss_giou': loss_giou * self.loss_weights['loss_giou'] } return losses def loss_masks(self, pos_bbox_pred, mask_logits, targets, indices, avg_factor): tgt_segm = [ paddle.gather( tgt['gt_segm'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ] tgt_masks = [] for i in range(len(indices)): gt_segm = tgt_segm[i].unsqueeze(1) if len(gt_segm) == 0: continue boxes = 
pos_bbox_pred[i] boxes[:, 0::2] = paddle.clip( boxes[:, 0::2], min=0, max=gt_segm.shape[3]) boxes[:, 1::2] = paddle.clip( boxes[:, 1::2], min=0, max=gt_segm.shape[2]) boxes_num = paddle.to_tensor([1] * len(boxes), dtype='int32') gt_mask = paddle.vision.ops.roi_align( gt_segm, boxes, boxes_num, output_size=mask_logits.shape[-2:], aligned=True) tgt_masks.append(gt_mask) tgt_masks = paddle.concat(tgt_masks).squeeze(1) tgt_masks = paddle.cast(tgt_masks >= 0.5, dtype='float32') tgt_masks.stop_gradient = True tgt_labels = paddle.concat([ paddle.gather( tgt['labels'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ]) mask_label = F.one_hot(tgt_labels, self.num_classes).unsqueeze([2, 3]) mask_label = paddle.expand_as(mask_label, mask_logits) mask_label.stop_gradient = True src_masks = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) shape = mask_logits.shape src_masks = paddle.reshape(src_masks, [shape[0], shape[2], shape[3]]) src_masks = F.sigmoid(src_masks) X = src_masks.flatten(1) Y = tgt_masks.flatten(1) inter = paddle.sum(X * Y, 1) union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1) dice = (2 * inter) / (union + 2e-5) loss_mask = (1 - dice).sum() / avg_factor losses = {'loss_mask': loss_mask * self.loss_weights['loss_mask']} return losses @staticmethod def _get_src_permutation_idx(indices): batch_idx = paddle.concat( [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = paddle.concat([src for (src, _) in indices]) return batch_idx, src_idx ================================================ FILE: ppdet/modeling/losses/smooth_l1_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register __all__ = ['SmoothL1Loss'] @register class SmoothL1Loss(nn.Layer): """Smooth L1 Loss. Args: beta (float): controls smooth region, it becomes L1 Loss when beta=0.0 loss_weight (float): the final loss will be multiplied by this """ def __init__(self, beta=1.0, loss_weight=1.0): super(SmoothL1Loss, self).__init__() assert beta >= 0 self.beta = beta self.loss_weight = loss_weight def forward(self, pred, target, reduction='none'): """forward function, based on fvcore. 
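It computes 0.5 * n ** 2 / beta for n = |pred - target| < beta and n - 0.5 * beta otherwise; when beta is below 1e-5 it falls back to a plain L1 loss.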
Args: pred (Tensor): prediction tensor target (Tensor): target tensor, pred.shape must be the same as target.shape reduction (str): the way to reduce loss, one of (none, sum, mean) """ assert reduction in ('none', 'sum', 'mean') target = target.detach() if self.beta < 1e-5: loss = paddle.abs(pred - target) else: n = paddle.abs(pred - target) cond = n < self.beta loss = paddle.where(cond, 0.5 * n ** 2 / self.beta, n - 0.5 * self.beta) if reduction == 'mean': loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum() elif reduction == 'sum': loss = loss.sum() return loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/solov2_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F from ppdet.core.workspace import register, serializable __all__ = ['SOLOv2Loss'] @register @serializable class SOLOv2Loss(object): """ SOLOv2Loss Args: ins_loss_weight (float): Weight of instance loss. focal_loss_gamma (float): Gamma parameter for focal loss. focal_loss_alpha (float): Alpha parameter for focal loss. """ def __init__(self, ins_loss_weight=3.0, focal_loss_gamma=2.0, focal_loss_alpha=0.25): self.ins_loss_weight = ins_loss_weight self.focal_loss_gamma = focal_loss_gamma self.focal_loss_alpha = focal_loss_alpha def _dice_loss(self, input, target): input = paddle.reshape(input, shape=(input.shape[0], -1)) target = paddle.reshape(target, shape=(target.shape[0], -1)) a = paddle.sum(input * target, axis=1) b = paddle.sum(input * input, axis=1) + 0.001 c = paddle.sum(target * target, axis=1) + 0.001 d = (2 * a) / (b + c) return 1 - d def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels, num_ins): """ Get loss of network of SOLOv2. Args: ins_pred_list (list): Variable list of instance branch output. ins_label_list (list): List of instance labels per batch. cate_preds (list): Concat Variable list of category branch output. cate_labels (list): Concat list of category labels per batch. num_ins (int): Number of positive samples in a mini-batch. Returns: loss_ins (Variable): The instance loss Variable of SOLOv2 network. loss_cate (Variable): The category loss Variable of SOLOv2 network. """ #1. Use dice_loss to calculate instance loss
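# For each grid cell, _dice_loss flattens the masks and computes
# dice = 2 * sum(p * t) / (sum(p * p) + sum(t * t)); cells whose target
# mask is empty receive zero weight below, so they do not contribute.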
loss_ins = [] total_weights = paddle.zeros(shape=[1], dtype='float32') for input, target in zip(ins_pred_list, ins_label_list): if input is None: continue target = paddle.cast(target, 'float32') target = paddle.reshape( target, shape=[-1, input.shape[-2], input.shape[-1]]) weights = paddle.cast( paddle.sum(target, axis=[1, 2]) > 0, 'float32') input = F.sigmoid(input) dice_out = paddle.multiply(self._dice_loss(input, target), weights) total_weights += paddle.sum(weights) loss_ins.append(dice_out) loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights loss_ins = loss_ins * self.ins_loss_weight #2. Use sigmoid_focal_loss to calculate category loss # expand onehot labels num_classes = cate_preds.shape[-1] cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1) cate_labels_bin = cate_labels_bin[:, 1:] loss_cate = F.sigmoid_focal_loss( cate_preds, label=cate_labels_bin, normalizer=num_ins + 1., gamma=self.focal_loss_gamma, alpha=self.focal_loss_alpha) return loss_ins, loss_cate ================================================ FILE: ppdet/modeling/losses/sparsercnn_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/loss.py The copyright of PeizeSun/SparseR-CNN is as follows: MIT License [see LICENSE for details] """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from scipy.optimize import linear_sum_assignment import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.metric import accuracy from ppdet.core.workspace import register from ppdet.modeling.losses.iou_loss import GIoULoss __all__ = ["SparseRCNNLoss"] @register class SparseRCNNLoss(nn.Layer): """ This class computes the loss for SparseRCNN. The process happens in two steps: 1) we compute Hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ __shared__ = ['num_classes'] def __init__(self, losses, focal_loss_alpha, focal_loss_gamma, num_classes=80, class_weight=2., l1_weight=5., giou_weight=2.): """ Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category weight_dict: dict containing as key the names of the losses and as values their relative weight. losses: list of all the losses to be applied. See get_loss for list of available losses.
matcher: module able to compute a matching between targets and proposals """ super().__init__() self.num_classes = num_classes weight_dict = { "loss_ce": class_weight, "loss_bbox": l1_weight, "loss_giou": giou_weight } self.weight_dict = weight_dict self.losses = losses self.giou_loss = GIoULoss(reduction="sum") self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, class_weight, l1_weight, giou_weight) def loss_labels(self, outputs, targets, indices, num_boxes, log=True): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] """ assert 'pred_logits' in outputs src_logits = outputs['pred_logits'] idx = self._get_src_permutation_idx(indices) target_classes_o = paddle.concat([ paddle.gather( t["labels"], J, axis=0) for t, (_, J) in zip(targets, indices) ]) target_classes = paddle.full( src_logits.shape[:2], self.num_classes, dtype="int32") for i, ind in enumerate(zip(idx[0], idx[1])): target_classes[int(ind[0]), int(ind[1])] = target_classes_o[i] target_classes.stop_gradient = True src_logits = src_logits.flatten(start_axis=0, stop_axis=1) # prepare one_hot target. target_classes = target_classes.flatten(start_axis=0, stop_axis=1) class_ids = paddle.arange(0, self.num_classes) labels = (target_classes.unsqueeze(-1) == class_ids.astype(target_classes.dtype)).astype("float32") labels.stop_gradient = True # comp focal loss. class_loss = sigmoid_focal_loss( src_logits, labels, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum", ) / num_boxes losses = {'loss_ce': class_loss} if log: label_acc = target_classes_o.unsqueeze(-1) src_idx = [src for (src, _) in indices] pred_list = [] for i in range(outputs["pred_logits"].shape[0]): pred_list.append( paddle.gather( outputs["pred_logits"][i], src_idx[i], axis=0)) pred = F.sigmoid(paddle.concat(pred_list, axis=0)) acc = accuracy(pred, label_acc.astype("int64")) losses["acc"] = acc return losses def loss_boxes(self, outputs, targets, indices, num_boxes): """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
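The L1 term is computed on boxes normalized by img_whwh_tgt, while the GIoU term uses the absolute coordinates.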
""" assert 'pred_boxes' in outputs # [batch_size, num_proposals, 4] src_idx = [src for (src, _) in indices] src_boxes_list = [] for i in range(outputs["pred_boxes"].shape[0]): src_boxes_list.append( paddle.gather( outputs["pred_boxes"][i], src_idx[i], axis=0)) src_boxes = paddle.concat(src_boxes_list, axis=0) target_boxes = paddle.concat( [ paddle.gather( t['boxes'], I, axis=0) for t, (_, I) in zip(targets, indices) ], axis=0) target_boxes.stop_gradient = True losses = {} losses['loss_giou'] = self.giou_loss(src_boxes, target_boxes) / num_boxes image_size = paddle.concat([v["img_whwh_tgt"] for v in targets]) src_boxes_ = src_boxes / image_size target_boxes_ = target_boxes / image_size loss_bbox = F.l1_loss(src_boxes_, target_boxes_, reduction='sum') losses['loss_bbox'] = loss_bbox / num_boxes return losses def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = paddle.concat( [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = paddle.concat([src for (src, _) in indices]) return batch_idx, src_idx def _get_tgt_permutation_idx(self, indices): # permute targets following indices batch_idx = paddle.concat( [paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = paddle.concat([tgt for (_, tgt) in indices]) return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): loss_map = { 'labels': self.loss_labels, 'boxes': self.loss_boxes, } assert loss in loss_map, f'do you really want to compute {loss} loss?' return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) def forward(self, outputs, targets): """ This performs the loss computation. Parameters: outputs: dict of tensors, see the output specification of the model for the format targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ outputs_without_aux = { k: v for k, v in outputs.items() if k != 'aux_outputs' } # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target boxes across all nodes, for normalization purposes num_boxes = sum(len(t["labels"]) for t in targets) num_boxes = paddle.to_tensor( [num_boxes], dtype="float32", place=next(iter(outputs.values())).place) # Compute all the requested losses losses = {} for loss in self.losses: losses.update( self.get_loss(loss, outputs, targets, indices, num_boxes)) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. if 'aux_outputs' in outputs: for i, aux_outputs in enumerate(outputs['aux_outputs']): indices = self.matcher(aux_outputs, targets) for loss in self.losses: kwargs = {} if loss == 'labels': # Logging is enabled only for the last layer kwargs = {'log': False} l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) w_dict = {} for k in l_dict.keys(): if k in self.weight_dict: w_dict[k + f'_{i}'] = l_dict[k] * self.weight_dict[ k] else: w_dict[k + f'_{i}'] = l_dict[k] losses.update(w_dict) return losses class HungarianMatcher(nn.Layer): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). 
""" def __init__(self, focal_loss_alpha, focal_loss_gamma, cost_class: float=1, cost_bbox: float=1, cost_giou: float=1): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_bbox = cost_bbox self.cost_giou = cost_giou self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" @paddle.no_grad() def forward(self, outputs, targets): """ Performs the matching Args: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates eg. outputs = {"pred_logits": pred_logits, "pred_boxes": pred_boxes} targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates eg. targets = [{"labels":labels, "boxes": boxes}, ...,{"labels":labels, "boxes": boxes}] Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ bs, num_queries = outputs["pred_logits"].shape[:2] if sum(len(v["labels"]) for v in targets) == 0: return [(paddle.to_tensor( [], dtype=paddle.int64), paddle.to_tensor( [], dtype=paddle.int64)) for _ in range(bs)] # We flatten to compute the cost matrices in a batch out_prob = F.sigmoid(outputs["pred_logits"].flatten( start_axis=0, stop_axis=1)) out_bbox = outputs["pred_boxes"].flatten(start_axis=0, stop_axis=1) # Also concat the target labels and boxes tgt_ids = paddle.concat([v["labels"] for v in targets]) assert (tgt_ids > -1).all() tgt_bbox = paddle.concat([v["boxes"] for v in targets]) # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. # Compute the classification cost. 
alpha = self.focal_loss_alpha gamma = self.focal_loss_gamma neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-( 1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) **gamma) * (-(out_prob + 1e-8).log()) cost_class = paddle.gather( pos_cost_class, tgt_ids, axis=1) - paddle.gather( neg_cost_class, tgt_ids, axis=1) # Compute the L1 cost between boxes image_size_out = paddle.concat( [v["img_whwh"].unsqueeze(0) for v in targets]) image_size_out = image_size_out.unsqueeze(1).tile( [1, num_queries, 1]).flatten( start_axis=0, stop_axis=1) image_size_tgt = paddle.concat([v["img_whwh_tgt"] for v in targets]) out_bbox_ = out_bbox / image_size_out tgt_bbox_ = tgt_bbox / image_size_tgt cost_bbox = F.l1_loss( out_bbox_.unsqueeze(-2), tgt_bbox_, reduction='none').sum(-1) # [batch_size * num_queries, num_tgts] # Compute the giou cost between boxes cost_giou = -get_bboxes_giou(out_bbox, tgt_bbox) # Final cost matrix C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou C = C.reshape([bs, num_queries, -1]) sizes = [len(v["boxes"]) for v in targets] indices = [ linear_sum_assignment(c[i].numpy()) for i, c in enumerate(C.split(sizes, -1)) ] return [(paddle.to_tensor( i, dtype="int32"), paddle.to_tensor( j, dtype="int32")) for i, j in indices] def box_area(boxes): assert (boxes[:, 2:] >= boxes[:, :2]).all() wh = boxes[:, 2:] - boxes[:, :2] return wh[:, 0] * wh[:, 1] def boxes_iou(boxes1, boxes2): ''' Compute iou Args: boxes1 (paddle.tensor) shape (N, 4) boxes2 (paddle.tensor) shape (M, 4) Return: (paddle.tensor) shape (N, M) ''' area1 = box_area(boxes1) area2 = box_area(boxes2) lt = paddle.maximum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) rb = paddle.minimum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) wh = (rb - lt).astype("float32").clip(min=1e-9) inter = wh[:, :, 0] * wh[:, :, 1] union = area1.unsqueeze(-1) + area2 - inter + 1e-9 iou = inter / union return iou, union def get_bboxes_giou(boxes1, boxes2, eps=1e-9): """calculate the GIoU of boxes1 and boxes2 Args: boxes1 (Tensor): shape [N, 4] boxes2 (Tensor): shape [M, 4] eps (float): epsilon to avoid divide by zero Return: gious (Tensor): GIoU values of boxes1 and boxes2, with the shape [N, M] """ assert (boxes1[:, 2:] >= boxes1[:, :2]).all() assert (boxes2[:, 2:] >= boxes2[:, :2]).all() iou, union = boxes_iou(boxes1, boxes2) lt = paddle.minimum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) rb = paddle.maximum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) wh = (rb - lt).astype("float32").clip(min=eps) enclose_area = wh[:, :, 0] * wh[:, :, 1] giou = iou - (enclose_area - union) / enclose_area return giou def sigmoid_focal_loss(inputs, targets, alpha, gamma, reduction="sum"): assert reduction in ["sum", "mean" ], f'unsupported reduction: {reduction}' p = F.sigmoid(inputs) ce_loss = F.binary_cross_entropy_with_logits( inputs, targets, reduction="none") p_t = p * targets + (1 - p) * (1 - targets) loss = ce_loss * ((1 - p_t)**gamma) if alpha >= 0: alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss if reduction == "mean": loss = loss.mean() elif reduction == "sum": loss = loss.sum() return loss ================================================ FILE: ppdet/modeling/losses/ssd_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import iou_similarity, bbox2delta __all__ = ['SSDLoss'] @register class SSDLoss(nn.Layer): """ SSDLoss Args: overlap_threshold (float32, optional): IoU threshold for negative bboxes and positive bboxes, 0.5 by default. neg_pos_ratio (float): The ratio of negative samples / positive samples. loc_loss_weight (float): The weight of loc_loss. conf_loss_weight (float): The weight of conf_loss. prior_box_var (list): Variances corresponding to prior box coord, [0.1, 0.1, 0.2, 0.2] by default. """ def __init__(self, overlap_threshold=0.5, neg_pos_ratio=3.0, loc_loss_weight=1.0, conf_loss_weight=1.0, prior_box_var=[0.1, 0.1, 0.2, 0.2]): super(SSDLoss, self).__init__() self.overlap_threshold = overlap_threshold self.neg_pos_ratio = neg_pos_ratio self.loc_loss_weight = loc_loss_weight self.conf_loss_weight = conf_loss_weight self.prior_box_var = [1. / a for a in prior_box_var] def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes, bg_index): """ Args: gt_bbox (Tensor): [B, N, 4] gt_label (Tensor): [B, N, 1] prior_boxes (Tensor): [A, 4] bg_index (int): Background class index """ batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0] ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape( (batch_size, -1, num_priors)) # For each prior box, get the max IoU of all GTs. prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1) # For each GT, get the max IoU of all prior boxes. gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2) # Gather target bbox and label according to 'prior_argmax_iou' index. batch_ind = paddle.arange(end=batch_size, dtype='int64').unsqueeze(-1) prior_argmax_iou = paddle.stack( [batch_ind.tile([1, num_priors]), prior_argmax_iou], axis=-1) targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou) targets_label = paddle.gather_nd(gt_label, prior_argmax_iou) # Assign negative bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index, 'int64') targets_label = paddle.where( prior_max_iou.unsqueeze(-1) < self.overlap_threshold, bg_index_tensor, targets_label) # Ensure each GT can match the max IoU prior box. 
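# The (batch, prior) pairs are flattened into a single index so that
# paddle.scatter can overwrite, for every GT, the row of its highest-IoU
# prior with that GT's box and label; this guarantees each GT keeps at
# least one positive prior even if all its IoUs fall below overlap_threshold.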
batch_ind = (batch_ind * num_priors + gt_argmax_iou).flatten() targets_bbox = paddle.scatter( targets_bbox.reshape([-1, 4]), batch_ind, gt_bbox.reshape([-1, 4])).reshape([batch_size, -1, 4]) targets_label = paddle.scatter( targets_label.reshape([-1, 1]), batch_ind, gt_label.reshape([-1, 1])).reshape([batch_size, -1, 1]) targets_label[:, :1] = bg_index # Encode box prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1]) targets_bbox = bbox2delta( prior_boxes.reshape([-1, 4]), targets_bbox.reshape([-1, 4]), self.prior_box_var) targets_bbox = targets_bbox.reshape([batch_size, -1, 4]) return targets_bbox, targets_label def _mine_hard_example(self, conf_loss, targets_label, bg_index, mine_neg_ratio=0.01): pos = (targets_label != bg_index).astype(conf_loss.dtype) num_pos = pos.sum(axis=1, keepdim=True) neg = (targets_label == bg_index).astype(conf_loss.dtype) conf_loss = conf_loss.detach() * neg loss_idx = conf_loss.argsort(axis=1, descending=True) idx_rank = loss_idx.argsort(axis=1) num_negs = [] for i in range(conf_loss.shape[0]): cur_num_pos = num_pos[i] num_neg = paddle.clip( cur_num_pos * self.neg_pos_ratio, max=pos.shape[1]) num_neg = num_neg if num_neg > 0 else paddle.to_tensor( [pos.shape[1] * mine_neg_ratio]) num_negs.append(num_neg) num_negs = paddle.stack(num_negs).expand_as(idx_rank) neg_mask = (idx_rank.astype(num_negs.dtype) < num_negs).astype(conf_loss.dtype) return (neg_mask + pos).astype('bool') def forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes): boxes = paddle.concat(boxes, axis=1) scores = paddle.concat(scores, axis=1) gt_label = gt_label.unsqueeze(-1).astype('int64') prior_boxes = paddle.concat(prior_boxes, axis=0) bg_index = scores.shape[-1] - 1 # Match bbox and get targets. targets_bbox, targets_label = \ self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index) targets_bbox.stop_gradient = True targets_label.stop_gradient = True # Compute regression loss. # Select positive samples. bbox_mask = paddle.tile(targets_label != bg_index, [1, 1, 4]) if bbox_mask.astype(boxes.dtype).sum() > 0: location = paddle.masked_select(boxes, bbox_mask) targets_bbox_tmp = paddle.masked_select(targets_bbox, bbox_mask) loc_loss = F.smooth_l1_loss(location, targets_bbox_tmp, reduction='sum') loc_loss = loc_loss * self.loc_loss_weight else: loc_loss = paddle.zeros([]) # Compute confidence loss. conf_loss = F.cross_entropy(scores, targets_label, reduction="none") # Mining hard examples. label_mask = self._mine_hard_example( conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index) conf_loss = paddle.masked_select(conf_loss, label_mask.unsqueeze(-1)) conf_loss = conf_loss.sum() * self.conf_loss_weight # Compute overall weighted loss. normalizer = (targets_label != bg_index).astype('float32').sum().clip( min=1) loss = (conf_loss + loc_loss) / normalizer return loss ================================================ FILE: ppdet/modeling/losses/supcontrast.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F import random from ppdet.core.workspace import register __all__ = ['SupContrast'] @register class SupContrast(nn.Layer): __shared__ = [ 'num_classes' ] def __init__(self, num_classes=80, temperature=2.5, sample_num=4096, thresh=0.75): super(SupContrast, self).__init__() self.num_classes = num_classes self.temperature = temperature self.sample_num = sample_num self.thresh = thresh def forward(self, features, labels, scores): assert features.shape[0] == labels.shape[0] == scores.shape[0] positive_mask = (labels < self.num_classes) positive_features, positive_labels, positive_scores = features[positive_mask], labels[positive_mask], \ scores[positive_mask] negative_mask = (labels == self.num_classes) negative_features, negative_labels, negative_scores = features[negative_mask], labels[negative_mask], \ scores[negative_mask] N = negative_features.shape[0] S = self.sample_num - positive_mask.sum() index = paddle.to_tensor(random.sample(range(N), int(S)), dtype='int32') negative_features = paddle.index_select(x=negative_features, index=index, axis=0) negative_labels = paddle.index_select(x=negative_labels, index=index, axis=0) negative_scores = paddle.index_select(x=negative_scores, index=index, axis=0) features = paddle.concat([positive_features, negative_features], 0) labels = paddle.concat([positive_labels, negative_labels], 0) scores = paddle.concat([positive_scores, negative_scores], 0) if len(labels.shape) == 1: labels = labels.reshape([-1, 1]) label_mask = paddle.equal(labels, labels.T).detach() similarity = (paddle.matmul(features, features.T) / self.temperature) sim_row_max = paddle.max(similarity, axis=1, keepdim=True) similarity = similarity - sim_row_max logits_mask = paddle.ones_like(similarity).detach() logits_mask.fill_diagonal_(0) exp_sim = paddle.exp(similarity) * logits_mask log_prob = similarity - paddle.log(exp_sim.sum(axis=1, keepdim=True)) per_label_log_prob = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1) keep = scores > self.thresh per_label_log_prob = per_label_log_prob[keep] loss = -per_label_log_prob return loss.mean() ================================================ FILE: ppdet/modeling/losses/varifocal_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling import ops # from paddle.base.framework import in_dygraph_mode __all__ = ['VarifocalLoss'] def varifocal_loss(pred, target, alpha=0.75, gamma=2.0, iou_weighted=True, use_sigmoid=True): """`Varifocal Loss `_ Args: pred (Tensor): The prediction with shape (N, C), C is the number of classes target (Tensor): The learning target of the iou-aware classification score with shape (N, C), C is the number of classes. alpha (float, optional): A balance factor for the negative part of Varifocal Loss, which is different from the alpha of Focal Loss. Defaults to 0.75. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. iou_weighted (bool, optional): Whether to weight the loss of the positive example with the iou target. Defaults to True. """ # pred and target should be of the same size assert len(pred.shape) == len(target.shape) # rank # if in_dygraph_mode(): # assert pred.shape == target.shape if use_sigmoid: pred_new = F.sigmoid(pred) else: pred_new = pred target = target.cast(pred.dtype) if iou_weighted: focal_weight = target * (target > 0.0).cast('float32') + \ alpha * (pred_new - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') else: focal_weight = (target > 0.0).cast('float32') + \ alpha * (pred_new - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') if use_sigmoid: loss = F.binary_cross_entropy_with_logits( pred, target, reduction='none') * focal_weight else: loss = F.binary_cross_entropy( pred, target, reduction='none') * focal_weight loss = loss.sum(axis=1) return loss @register @serializable class VarifocalLoss(nn.Layer): def __init__(self, use_sigmoid=True, alpha=0.75, gamma=2.0, iou_weighted=True, reduction='mean', loss_weight=1.0): """`Varifocal Loss `_ Args: use_sigmoid (bool, optional): Whether the prediction is used for sigmoid or softmax. Defaults to True. alpha (float, optional): A balance factor for the negative part of Varifocal Loss, which is different from the alpha of Focal Loss. Defaults to 0.75. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. iou_weighted (bool, optional): Whether to weight the loss of the positive examples with the iou target. Defaults to True. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". loss_weight (float, optional): Weight of loss. Defaults to 1.0. """ super(VarifocalLoss, self).__init__() assert alpha >= 0.0 self.use_sigmoid = use_sigmoid self.alpha = alpha self.gamma = gamma self.iou_weighted = iou_weighted self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. 
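Note: avg_factor only takes effect when reduction is 'mean' (the loss is summed and divided by avg_factor); it is ignored when reduction is 'none' and raises a ValueError when reduction is 'sum'.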
Returns: Tensor: The calculated loss """ loss = self.loss_weight * varifocal_loss( pred, target, alpha=self.alpha, gamma=self.gamma, iou_weighted=self.iou_weighted, use_sigmoid=self.use_sigmoid) if weight is not None: loss = loss * weight if avg_factor is None: if self.reduction == 'none': return loss elif self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': return loss.sum() else: # if reduction is mean, then average the loss by avg_factor if self.reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif self.reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') return loss ================================================ FILE: ppdet/modeling/losses/yolo_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity __all__ = ['YOLOv3Loss'] def bbox_transform(pbox, anchor, downsample): pbox = decode_yolo(pbox, anchor, downsample) pbox = xywh2xyxy(pbox) return pbox @register class YOLOv3Loss(nn.Layer): __inject__ = ['iou_loss', 'iou_aware_loss'] __shared__ = ['num_classes'] def __init__(self, num_classes=80, ignore_thresh=0.7, label_smooth=False, downsample=[32, 16, 8], scale_x_y=1., iou_loss=None, iou_aware_loss=None): """ YOLOv3Loss layer Args: num_classes (int): number of foreground classes ignore_thresh (float): threshold to ignore confidence loss label_smooth (bool): whether to use label smoothing downsample (list): downsample ratio for each detection block scale_x_y (float): scale_x_y factor iou_loss (object): IoULoss instance iou_aware_loss (object): IouAwareLoss instance """ super(YOLOv3Loss, self).__init__() self.num_classes = num_classes self.ignore_thresh = ignore_thresh self.label_smooth = label_smooth self.downsample = downsample self.scale_x_y = scale_x_y self.iou_loss = iou_loss self.iou_aware_loss = iou_aware_loss self.distill_pairs = [] def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample): # pbox pbox = decode_yolo(pbox, anchor, downsample) pbox = xywh2xyxy(pbox) pbox = paddle.concat(pbox, axis=-1) b = pbox.shape[0] pbox = pbox.reshape((b, -1, 4)) # gbox gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5 gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 gbox = paddle.concat([gxy, gwh], axis=-1) iou = batch_iou_similarity(pbox, gbox) iou.stop_gradient = True iou_max = iou.max(2) # [N, M1] iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype) iou_mask.stop_gradient = True pobj = pobj.reshape((b, -1)) tobj = tobj.reshape((b, -1)) obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype) obj_mask.stop_gradient = True loss_obj = F.binary_cross_entropy_with_logits( pobj,
obj_mask, reduction='none') loss_obj_pos = (loss_obj * tobj) loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask) return loss_obj_pos + loss_obj_neg def cls_loss(self, pcls, tcls): if self.label_smooth: delta = min(1. / self.num_classes, 1. / 40) pos, neg = 1 - delta, delta # 1 for positive, 0 for negative tcls = pos * paddle.cast( tcls > 0., dtype=tcls.dtype) + neg * paddle.cast( tcls <= 0., dtype=tcls.dtype) loss_cls = F.binary_cross_entropy_with_logits( pcls, tcls, reduction='none') return loss_cls def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1., eps=1e-10): na = len(anchor) b, c, h, w = p.shape if self.iou_aware_loss: ioup, p = p[:, 0:na, :, :], p[:, na:, :, :] ioup = ioup.unsqueeze(-1) p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2)) x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2] w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4] obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:] self.distill_pairs.append([x, y, w, h, obj, pcls]) t = t.transpose((0, 1, 3, 4, 2)) tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2] tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4] tscale = t[:, :, :, :, 4:5] tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:] tscale_obj = tscale * tobj loss = dict() x = scale * F.sigmoid(x) - 0.5 * (scale - 1.) y = scale * F.sigmoid(y) - 0.5 * (scale - 1.) if abs(scale - 1.) < eps: loss_x = F.binary_cross_entropy(x, tx, reduction='none') loss_y = F.binary_cross_entropy(y, ty, reduction='none') loss_xy = tscale_obj * (loss_x + loss_y) else: loss_x = paddle.abs(x - tx) loss_y = paddle.abs(y - ty) loss_xy = tscale_obj * (loss_x + loss_y) loss_xy = loss_xy.sum([1, 2, 3, 4]).mean() loss_w = paddle.abs(w - tw) loss_h = paddle.abs(h - th) loss_wh = tscale_obj * (loss_w + loss_h) loss_wh = loss_wh.sum([1, 2, 3, 4]).mean() loss['loss_xy'] = loss_xy loss['loss_wh'] = loss_wh if self.iou_loss is not None: # warn: do not modify x, y, w, h in place box, tbox = [x, y, w, h], [tx, ty, tw, th] pbox = bbox_transform(box, anchor, downsample) gbox = bbox_transform(tbox, anchor, downsample) loss_iou = self.iou_loss(pbox, gbox) loss_iou = loss_iou * tscale_obj loss_iou = loss_iou.sum([1, 2, 3, 4]).mean() loss['loss_iou'] = loss_iou if self.iou_aware_loss is not None: box, tbox = [x, y, w, h], [tx, ty, tw, th] pbox = bbox_transform(box, anchor, downsample) gbox = bbox_transform(tbox, anchor, downsample) loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox) loss_iou_aware = loss_iou_aware * tobj loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean() loss['loss_iou_aware'] = loss_iou_aware box = [x, y, w, h] loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample) loss_obj = loss_obj.sum(-1).mean() loss['loss_obj'] = loss_obj loss_cls = self.cls_loss(pcls, tcls) * tobj loss_cls = loss_cls.sum([1, 2, 3, 4]).mean() loss['loss_cls'] = loss_cls return loss def forward(self, inputs, targets, anchors): np = len(inputs) gt_targets = [targets['target{}'.format(i)] for i in range(np)] gt_box = targets['gt_bbox'] yolo_losses = dict() self.distill_pairs.clear() for x, t, anchor, downsample in zip(inputs, gt_targets, anchors, self.downsample): yolo_loss = self.yolov3_loss( x.astype('float32'), t, gt_box, anchor, downsample, self.scale_x_y) for k, v in yolo_loss.items(): if k in yolo_losses: yolo_losses[k] += v else: yolo_losses[k] = v loss = 0 for k, v in yolo_losses.items(): loss += v yolo_losses['loss'] = loss return yolo_losses ================================================ FILE: ppdet/modeling/mot/__init__.py ================================================ # Copyright 
(c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import matching from . import tracker from . import motion from . import visualization from . import utils from .matching import * from .tracker import * from .motion import * from .visualization import * from .utils import * ================================================ FILE: ppdet/modeling/mot/matching/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import jde_matching from . import deepsort_matching from . import ocsort_matching from .jde_matching import * from .deepsort_matching import * from .ocsort_matching import * ================================================ FILE: ppdet/modeling/mot/matching/deepsort_matching.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/tree/master/deep_sort """ import numpy as np from scipy.optimize import linear_sum_assignment from ..motion import kalman_filter INFTY_COST = 1e+5 __all__ = [ 'iou_1toN', 'iou_cost', '_nn_euclidean_distance', '_nn_cosine_distance', 'NearestNeighborDistanceMetric', 'min_cost_matching', 'matching_cascade', 'gate_cost_matrix', ] def iou_1toN(bbox, candidates): """ Compute intersection over union (IoU) between one box and N candidates. Args: bbox (ndarray): A bounding box in format `(top left x, top left y, width, height)`. candidates (ndarray): A matrix of candidate bounding boxes (one per row) in the same format as `bbox`. Returns: ious (ndarray): The intersection over union in [0, 1] between the `bbox` and each candidate. A higher score means a larger fraction of the `bbox` is occluded by the candidate.
""" bbox_tl = bbox[:2] bbox_br = bbox[:2] + bbox[2:] candidates_tl = candidates[:, :2] candidates_br = candidates[:, :2] + candidates[:, 2:] tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] wh = np.maximum(0., br - tl) area_intersection = wh.prod(axis=1) area_bbox = bbox[2:].prod() area_candidates = candidates[:, 2:].prod(axis=1) ious = area_intersection / (area_bbox + area_candidates - area_intersection) return ious def iou_cost(tracks, detections, track_indices=None, detection_indices=None): """ IoU distance metric. Args: tracks (list[Track]): A list of tracks. detections (list[Detection]): A list of detections. track_indices (Optional[list[int]]): A list of indices to tracks that should be matched. Defaults to all `tracks`. detection_indices (Optional[list[int]]): A list of indices to detections that should be matched. Defaults to all `detections`. Returns: cost_matrix (ndarray): A cost matrix of shape len(track_indices), len(detection_indices) where entry (i, j) is `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. """ if track_indices is None: track_indices = np.arange(len(tracks)) if detection_indices is None: detection_indices = np.arange(len(detections)) cost_matrix = np.zeros((len(track_indices), len(detection_indices))) for row, track_idx in enumerate(track_indices): if tracks[track_idx].time_since_update > 1: cost_matrix[row, :] = 1e+5 continue bbox = tracks[track_idx].to_tlwh() candidates = np.asarray([detections[i].tlwh for i in detection_indices]) cost_matrix[row, :] = 1. - iou_1toN(bbox, candidates) return cost_matrix def _nn_euclidean_distance(s, q): """ Compute pair-wise squared (Euclidean) distance between points in `s` and `q`. Args: s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. Returns: distances (ndarray): A vector of length M that contains for each entry in `q` the smallest Euclidean distance to a sample in `s`. """ s, q = np.asarray(s), np.asarray(q) if len(s) == 0 or len(q) == 0: return np.zeros((len(s), len(q))) s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1) distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :] distances = np.clip(distances, 0., float(np.inf)) return np.maximum(0.0, distances.min(axis=0)) def _nn_cosine_distance(s, q): """ Compute pair-wise cosine distance between points in `s` and `q`. Args: s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. Returns: distances (ndarray): A vector of length M that contains for each entry in `q` the smallest Euclidean distance to a sample in `s`. """ s = np.asarray(s) / np.linalg.norm(s, axis=1, keepdims=True) q = np.asarray(q) / np.linalg.norm(q, axis=1, keepdims=True) distances = 1. - np.dot(s, q.T) return distances.min(axis=0) class NearestNeighborDistanceMetric(object): """ A nearest neighbor distance metric that, for each target, returns the closest distance to any sample that has been observed so far. Args: metric (str): Either "euclidean" or "cosine". matching_threshold (float): The matching threshold. Samples with larger distance are considered an invalid match. budget (Optional[int]): If not None, fix samples per class to at most this number. 
Removes the oldest samples when the budget is reached. Attributes: samples (Dict[int -> List[ndarray]]): A dictionary that maps from target identities to the list of samples that have been observed so far. """ def __init__(self, metric, matching_threshold, budget=None): if metric == "euclidean": self._metric = _nn_euclidean_distance elif metric == "cosine": self._metric = _nn_cosine_distance else: raise ValueError( "Invalid metric; must be either 'euclidean' or 'cosine'") self.matching_threshold = matching_threshold self.budget = budget self.samples = {} def partial_fit(self, features, targets, active_targets): """ Update the distance metric with new data. Args: features (ndarray): An NxM matrix of N features of dimensionality M. targets (ndarray): An integer array of associated target identities. active_targets (List[int]): A list of targets that are currently present in the scene. """ for feature, target in zip(features, targets): self.samples.setdefault(target, []).append(feature) if self.budget is not None: self.samples[target] = self.samples[target][-self.budget:] self.samples = {k: self.samples[k] for k in active_targets} def distance(self, features, targets): """ Compute distance between features and targets. Args: features (ndarray): An NxM matrix of N features of dimensionality M. targets (list[int]): A list of targets to match the given `features` against. Returns: cost_matrix (ndarray): a cost matrix of shape len(targets), len(features), where element (i, j) contains the closest squared distance between `targets[i]` and `features[j]`. """ cost_matrix = np.zeros((len(targets), len(features))) for i, target in enumerate(targets): cost_matrix[i, :] = self._metric(self.samples[target], features) return cost_matrix def min_cost_matching(distance_metric, max_distance, tracks, detections, track_indices=None, detection_indices=None): """ Solve linear assignment problem. Args: distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray The distance metric is given a list of tracks and detections as well as a list of N track indices and M detection indices. The metric should return the NxM dimensional cost matrix, where element (i, j) is the association cost between the i-th track in the given track indices and the j-th detection in the given detection_indices. max_distance (float): Gating threshold. Associations with cost larger than this value are disregarded. tracks (list[Track]): A list of predicted tracks at the current time step. detections (list[Detection]): A list of detections at the current time step. track_indices (list[int]): List of track indices that maps rows in `cost_matrix` to tracks in `tracks`. detection_indices (List[int]): List of detection indices that maps columns in `cost_matrix` to detections in `detections`. Returns: A tuple (List[(int, int)], List[int], List[int]) with the following three entries: * A list of matched track and detection indices. * A list of unmatched track indices. * A list of unmatched detection indices. """ if track_indices is None: track_indices = np.arange(len(tracks)) if detection_indices is None: detection_indices = np.arange(len(detections)) if len(detection_indices) == 0 or len(track_indices) == 0: return [], track_indices, detection_indices # Nothing to match. 
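# --- Added note (commentary, not part of the original deep_sort code): the
# steps below build the cost matrix, clamp every entry above `max_distance`
# to `max_distance + 1e-5` so the solver still returns a complete assignment,
# and afterwards drop any matched pair whose cost exceeds the gate. A minimal
# sketch of the same gating idea with made-up numbers:
#     cost = np.array([[0.2, 0.9], [0.8, 0.3]])
#     cost[cost > 0.5] = 0.5 + 1e-5
#     rows, cols = linear_sum_assignment(cost)
#     kept = [(r, c) for r, c in zip(rows, cols) if cost[r, c] <= 0.5]
#     # kept == [(0, 0), (1, 1)]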
cost_matrix = distance_metric(tracks, detections, track_indices, detection_indices) cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5 indices = linear_sum_assignment(cost_matrix) matches, unmatched_tracks, unmatched_detections = [], [], [] for col, detection_idx in enumerate(detection_indices): if col not in indices[1]: unmatched_detections.append(detection_idx) for row, track_idx in enumerate(track_indices): if row not in indices[0]: unmatched_tracks.append(track_idx) for row, col in zip(indices[0], indices[1]): track_idx = track_indices[row] detection_idx = detection_indices[col] if cost_matrix[row, col] > max_distance: unmatched_tracks.append(track_idx) unmatched_detections.append(detection_idx) else: matches.append((track_idx, detection_idx)) return matches, unmatched_tracks, unmatched_detections def matching_cascade(distance_metric, max_distance, cascade_depth, tracks, detections, track_indices=None, detection_indices=None): """ Run matching cascade. Args: distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray The distance metric is given a list of tracks and detections as well as a list of N track indices and M detection indices. The metric should return the NxM dimensional cost matrix, where element (i, j) is the association cost between the i-th track in the given track indices and the j-th detection in the given detection_indices. max_distance (float): Gating threshold. Associations with cost larger than this value are disregarded. cascade_depth (int): The cascade depth, should be set to the maximum track age. tracks (list[Track]): A list of predicted tracks at the current time step. detections (list[Detection]): A list of detections at the current time step. track_indices (list[int]): List of track indices that maps rows in `cost_matrix` to tracks in `tracks`. detection_indices (List[int]): List of detection indices that maps columns in `cost_matrix` to detections in `detections`. Returns: A tuple (List[(int, int)], List[int], List[int]) with the following three entries: * A list of matched track and detection indices. * A list of unmatched track indices. * A list of unmatched detection indices. """ if track_indices is None: track_indices = list(range(len(tracks))) if detection_indices is None: detection_indices = list(range(len(detections))) unmatched_detections = detection_indices matches = [] for level in range(cascade_depth): if len(unmatched_detections) == 0: # No detections left break track_indices_l = [ k for k in track_indices if tracks[k].time_since_update == 1 + level ] if len(track_indices_l) == 0: # Nothing to match at this level continue matches_l, _, unmatched_detections = \ min_cost_matching( distance_metric, max_distance, tracks, detections, track_indices_l, unmatched_detections) matches += matches_l unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) return matches, unmatched_tracks, unmatched_detections def gate_cost_matrix(kf, cost_matrix, tracks, detections, track_indices, detection_indices, gated_cost=INFTY_COST, only_position=False): """ Invalidate infeasible entries in cost matrix based on the state distributions obtained by Kalman filtering. Args: kf (object): The Kalman filter. cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the number of track indices and M is the number of detection indices, such that entry (i, j) is the association cost between `tracks[track_indices[i]]` and `detections[detection_indices[j]]`.
tracks (list[Track]): A list of predicted tracks at the current time step. detections (list[Detection]): A list of detections at the current time step. track_indices (List[int]): List of track indices that maps rows in `cost_matrix` to tracks in `tracks`. detection_indices (List[int]): List of detection indices that maps columns in `cost_matrix` to detections in `detections`. gated_cost (Optional[float]): Entries in the cost matrix corresponding to infeasible associations are set to this value. Defaults to a very large value. only_position (Optional[bool]): If True, only the x, y position of the state distribution is considered during gating. Default False. """ gating_dim = 2 if only_position else 4 gating_threshold = kalman_filter.chi2inv95[gating_dim] measurements = np.asarray( [detections[i].to_xyah() for i in detection_indices]) for row, track_idx in enumerate(track_indices): track = tracks[track_idx] gating_distance = kf.gating_distance(track.mean, track.covariance, measurements, only_position) cost_matrix[row, gating_distance > gating_threshold] = gated_cost return cost_matrix ================================================ FILE: ppdet/modeling/mot/matching/jde_matching.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
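Before the JDE matching utilities that follow, a minimal hedged sketch of how the DeepSORT appearance metric above is driven (import path taken from this repository's layout and requires its dependencies, e.g. paddle, to be importable; the 128-d features and thresholds are arbitrary illustration values):

```python
import numpy as np
from ppdet.modeling.mot.matching.deepsort_matching import (
    NearestNeighborDistanceMetric)

# Cosine distance, keeping at most 100 historical features per track id.
metric = NearestNeighborDistanceMetric(
    'cosine', matching_threshold=0.4, budget=100)

# Register three observed features for two targets (ids 1 and 2) ...
feats = np.random.rand(3, 128).astype(np.float32)
metric.partial_fit(feats, targets=np.array([1, 1, 2]), active_targets=[1, 2])

# ... then score two new detections against both targets.
cost = metric.distance(
    np.random.rand(2, 128).astype(np.float32), targets=[1, 2])
print(cost.shape)  # (2, 2): one row per target, one column per detection
```

In the full tracker, a cost matrix like this is passed through `matching_cascade` (which calls `min_cost_matching` per age level) and gated by `gate_cost_matrix` before matches are accepted.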
""" This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py """ try: import lap except: print( 'Warning: Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' ) pass import scipy import numpy as np from scipy.spatial.distance import cdist from ..motion import kalman_filter import warnings warnings.filterwarnings("ignore") __all__ = [ 'merge_matches', 'linear_assignment', 'bbox_ious', 'iou_distance', 'embedding_distance', 'fuse_motion', ] def merge_matches(m1, m2, shape): O, P, Q = shape m1 = np.asarray(m1) m2 = np.asarray(m2) M1 = scipy.sparse.coo_matrix( (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) M2 = scipy.sparse.coo_matrix( (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) mask = M1 * M2 match = mask.nonzero() match = list(zip(match[0], match[1])) unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) return match, unmatched_O, unmatched_Q def linear_assignment(cost_matrix, thresh): try: import lap except Exception as e: raise RuntimeError( 'Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' ) if cost_matrix.size == 0: return np.empty( (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple( range(cost_matrix.shape[1])) matches, unmatched_a, unmatched_b = [], [], [] cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) for ix, mx in enumerate(x): if mx >= 0: matches.append([ix, mx]) unmatched_a = np.where(x < 0)[0] unmatched_b = np.where(y < 0)[0] matches = np.asarray(matches) return matches, unmatched_a, unmatched_b def bbox_ious(atlbrs, btlbrs): boxes = np.ascontiguousarray(atlbrs, dtype=np.float32) query_boxes = np.ascontiguousarray(btlbrs, dtype=np.float32) N = boxes.shape[0] K = query_boxes.shape[0] ious = np.zeros((N, K), dtype=boxes.dtype) if N * K == 0: return ious for k in range(K): box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1)) for n in range(N): iw = (min(boxes[n, 2], query_boxes[k, 2]) - max( boxes[n, 0], query_boxes[k, 0]) + 1) if iw > 0: ih = (min(boxes[n, 3], query_boxes[k, 3]) - max( boxes[n, 1], query_boxes[k, 1]) + 1) if ih > 0: ua = float((boxes[n, 2] - boxes[n, 0] + 1) * (boxes[ n, 3] - boxes[n, 1] + 1) + box_area - iw * ih) ious[n, k] = iw * ih / ua return ious def iou_distance(atracks, btracks): """ Compute cost based on IoU between two list[STrack]. """ if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or ( len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): atlbrs = atracks btlbrs = btracks else: atlbrs = [track.tlbr for track in atracks] btlbrs = [track.tlbr for track in btracks] _ious = bbox_ious(atlbrs, btlbrs) cost_matrix = 1 - _ious return cost_matrix def embedding_distance(tracks, detections, metric='euclidean'): """ Compute cost based on features between two list[STrack]. 
""" cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32) if cost_matrix.size == 0: return cost_matrix det_features = np.asarray( [track.curr_feat for track in detections], dtype=np.float32) track_features = np.asarray( [track.smooth_feat for track in tracks], dtype=np.float32) cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric)) # Nomalized features return cost_matrix def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98): if cost_matrix.size == 0: return cost_matrix gating_dim = 2 if only_position else 4 gating_threshold = kalman_filter.chi2inv95[gating_dim] measurements = np.asarray([det.to_xyah() for det in detections]) for row, track in enumerate(tracks): gating_distance = kf.gating_distance( track.mean, track.covariance, measurements, only_position, metric='maha') cost_matrix[row, gating_distance > gating_threshold] = np.inf cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_ ) * gating_distance return cost_matrix ================================================ FILE: ppdet/modeling/mot/matching/ocsort_matching.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/association.py """ import os import numpy as np def iou_batch(bboxes1, bboxes2): bboxes2 = np.expand_dims(bboxes2, 0) bboxes1 = np.expand_dims(bboxes1, 1) xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) w = np.maximum(0., xx2 - xx1) h = np.maximum(0., yy2 - yy1) area = w * h iou_matrix = area / ((bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) + (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) - area) return iou_matrix def speed_direction_batch(dets, tracks): tracks = tracks[..., np.newaxis] CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0 CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, ( tracks[:, 1] + tracks[:, 3]) / 2.0 dx = CX1 - CX2 dy = CY1 - CY2 norm = np.sqrt(dx**2 + dy**2) + 1e-6 dx = dx / norm dy = dy / norm return dy, dx def linear_assignment(cost_matrix): try: import lap _, x, y = lap.lapjv(cost_matrix, extend_cost=True) return np.array([[y[i], i] for i in x if i >= 0]) except ImportError: from scipy.optimize import linear_sum_assignment x, y = linear_sum_assignment(cost_matrix) return np.array(list(zip(x, y))) def associate(detections, trackers, iou_threshold, velocities, previous_obs, vdc_weight): if (len(trackers) == 0): return np.empty( (0, 2), dtype=int), np.arange(len(detections)), np.empty( (0, 5), dtype=int) Y, X = speed_direction_batch(detections, previous_obs) inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) inertia_X = 
np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) diff_angle_cos = inertia_X * X + inertia_Y * Y diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) diff_angle = np.arccos(diff_angle_cos) diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi valid_mask = np.ones(previous_obs.shape[0]) valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 iou_matrix = iou_batch(detections, trackers) scores = np.repeat( detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) # iou_matrix = iou_matrix * scores # a trick sometimes works, we don't encourage this valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) angle_diff_cost = (valid_mask * diff_angle) * vdc_weight angle_diff_cost = angle_diff_cost.T angle_diff_cost = angle_diff_cost * scores if min(iou_matrix.shape) > 0: a = (iou_matrix > iou_threshold).astype(np.int32) if a.sum(1).max() == 1 and a.sum(0).max() == 1: matched_indices = np.stack(np.where(a), axis=1) else: matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost)) else: matched_indices = np.empty(shape=(0, 2)) unmatched_detections = [] for d, det in enumerate(detections): if (d not in matched_indices[:, 0]): unmatched_detections.append(d) unmatched_trackers = [] for t, trk in enumerate(trackers): if (t not in matched_indices[:, 1]): unmatched_trackers.append(t) # filter out matched with low IOU matches = [] for m in matched_indices: if (iou_matrix[m[0], m[1]] < iou_threshold): unmatched_detections.append(m[0]) unmatched_trackers.append(m[1]) else: matches.append(m.reshape(1, 2)) if (len(matches) == 0): matches = np.empty((0, 2), dtype=int) else: matches = np.concatenate(matches, axis=0) return matches, np.array(unmatched_detections), np.array(unmatched_trackers) def associate_only_iou(detections, trackers, iou_threshold): if (len(trackers) == 0): return np.empty( (0, 2), dtype=int), np.arange(len(detections)), np.empty( (0, 5), dtype=int) iou_matrix = iou_batch(detections, trackers) if min(iou_matrix.shape) > 0: a = (iou_matrix > iou_threshold).astype(np.int32) if a.sum(1).max() == 1 and a.sum(0).max() == 1: matched_indices = np.stack(np.where(a), axis=1) else: matched_indices = linear_assignment(-iou_matrix) else: matched_indices = np.empty(shape=(0, 2)) unmatched_detections = [] for d, det in enumerate(detections): if (d not in matched_indices[:, 0]): unmatched_detections.append(d) unmatched_trackers = [] for t, trk in enumerate(trackers): if (t not in matched_indices[:, 1]): unmatched_trackers.append(t) # filter out matched with low IOU matches = [] for m in matched_indices: if (iou_matrix[m[0], m[1]] < iou_threshold): unmatched_detections.append(m[0]) unmatched_trackers.append(m[1]) else: matches.append(m.reshape(1, 2)) if (len(matches) == 0): matches = np.empty((0, 2), dtype=int) else: matches = np.concatenate(matches, axis=0) return matches, np.array(unmatched_detections), np.array(unmatched_trackers) ================================================ FILE: ppdet/modeling/mot/motion/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
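As a quick numeric check of the IoU-only association path above, a small hedged example (hand-picked boxes; import path assumed from this repository's layout):

```python
import numpy as np
from ppdet.modeling.mot.matching.ocsort_matching import (
    iou_batch, associate_only_iou)

dets = np.array([[0., 0., 10., 10.], [20., 20., 30., 30.]])  # (x1, y1, x2, y2)
trks = np.array([[1., 1., 11., 11.], [50., 50., 60., 60.]])

print(iou_batch(dets, trks))  # only det 0 / trk 0 overlap (IoU ~ 0.68)

matches, unmatched_dets, unmatched_trks = associate_only_iou(
    dets, trks, iou_threshold=0.3)
print(matches)                # [[0 0]]; det 1 and trk 1 remain unmatched
```

The full `associate` additionally adds the velocity-direction-consistency term (`angle_diff_cost`, weighted by `vdc_weight`) to the IoU matrix before the assignment is solved.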
# See the License for the specific language governing permissions and # limitations under the License. from . import kalman_filter from .kalman_filter import * from .gmc import * ================================================ FILE: ppdet/modeling/mot/motion/gmc.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/gmc.py """ import cv2 import matplotlib.pyplot as plt import numpy as np import copy import time from ppdet.core.workspace import register, serializable @register @serializable class GMC: def __init__(self, method='sparseOptFlow', downscale=2, verbose=None): super(GMC, self).__init__() self.method = method self.downscale = max(1, int(downscale)) if self.method == 'orb': self.detector = cv2.FastFeatureDetector_create(20) self.extractor = cv2.ORB_create() self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) elif self.method == 'sift': self.detector = cv2.SIFT_create( nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) self.extractor = cv2.SIFT_create( nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) self.matcher = cv2.BFMatcher(cv2.NORM_L2) elif self.method == 'ecc': number_of_iterations = 5000 termination_eps = 1e-6 self.warp_mode = cv2.MOTION_EUCLIDEAN self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, number_of_iterations, termination_eps) elif self.method == 'sparseOptFlow': self.feature_params = dict( maxCorners=1000, qualityLevel=0.01, minDistance=1, blockSize=3, useHarrisDetector=False, k=0.04) # self.gmc_file = open('GMC_results.txt', 'w') elif self.method == 'file' or self.method == 'files': seqName = verbose[0] ablation = verbose[1] if ablation: filePath = r'tracker/GMC_files/MOT17_ablation' else: filePath = r'tracker/GMC_files/MOTChallenge' if '-FRCNN' in seqName: seqName = seqName[:-6] elif '-DPM' in seqName: seqName = seqName[:-4] elif '-SDP' in seqName: seqName = seqName[:-4] self.gmcFile = open(filePath + "/GMC-" + seqName + ".txt", 'r') if self.gmcFile is None: raise ValueError("Error: Unable to open GMC file in directory:" + filePath) elif self.method == 'none' or self.method == 'None': self.method = 'none' else: raise ValueError("Error: Unknown CMC method:" + method) self.prevFrame = None self.prevKeyPoints = None self.prevDescriptors = None self.initializedFirstFrame = False def apply(self, raw_frame, detections=None): if self.method == 'orb' or self.method == 'sift': return self.applyFeaures(raw_frame, detections) elif self.method == 'ecc': return self.applyEcc(raw_frame, detections) elif self.method == 'sparseOptFlow': return self.applySparseOptFlow(raw_frame, detections) elif self.method == 'file': return self.applyFile(raw_frame, detections) elif self.method == 'none': return np.eye(2, 3) else: return np.eye(2, 3) def applyEcc(self, raw_frame, detections=None): # Initialize height, width, _ = raw_frame.shape frame = cv2.cvtColor(raw_frame, 
cv2.COLOR_BGR2GRAY) H = np.eye(2, 3, dtype=np.float32) # Downscale image (TODO: consider using pyramids) if self.downscale > 1.0: frame = cv2.GaussianBlur(frame, (3, 3), 1.5) frame = cv2.resize(frame, (width // self.downscale, height // self.downscale)) width = width // self.downscale height = height // self.downscale # Handle first frame if not self.initializedFirstFrame: # Initialize data self.prevFrame = frame.copy() # Initialization done self.initializedFirstFrame = True return H # Run the ECC algorithm. The results are stored in warp_matrix. # (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria) try: (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria, None, 1) except: print('Warning: find transform failed. Set warp as identity') return H def applyFeaures(self, raw_frame, detections=None): # Initialize height, width, _ = raw_frame.shape frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) H = np.eye(2, 3) # Downscale image (TODO: consider using pyramids) if self.downscale > 1.0: # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) frame = cv2.resize(frame, (width // self.downscale, height // self.downscale)) width = width // self.downscale height = height // self.downscale # find the keypoints mask = np.zeros_like(frame) # mask[int(0.05 * height): int(0.95 * height), int(0.05 * width): int(0.95 * width)] = 255 mask[int(0.02 * height):int(0.98 * height), int(0.02 * width):int( 0.98 * width)] = 255 if detections is not None: for det in detections: tlbr = (det[:4] / self.downscale).astype(np.int_) mask[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2]] = 0 keypoints = self.detector.detect(frame, mask) # compute the descriptors keypoints, descriptors = self.extractor.compute(frame, keypoints) # Handle first frame if not self.initializedFirstFrame: # Initialize data self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) self.prevDescriptors = copy.copy(descriptors) # Initialization done self.initializedFirstFrame = True return H # Match descriptors. 
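# --- Added note (commentary, not in the original BoT-SORT/SMILEtrack code):
# knnMatch below returns the two nearest current-frame descriptors for each
# previous-frame descriptor, and Lowe's ratio test
# `m.distance < 0.9 * n.distance` keeps a match only when the best neighbour
# is clearly better than the second best, e.g.:
#     best, second = 21.0, 40.0
#     keep = best < 0.9 * second   # True: a distinctive match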
knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2) # Filter matches based on smallest spatial distance matches = [] spatialDistances = [] maxSpatialDistance = 0.25 * np.array([width, height]) # Handle empty matches case if len(knnMatches) == 0: # Store to next iteration self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) self.prevDescriptors = copy.copy(descriptors) return H for m, n in knnMatches: if m.distance < 0.9 * n.distance: prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt currKeyPointLocation = keypoints[m.trainIdx].pt spatialDistance = ( prevKeyPointLocation[0] - currKeyPointLocation[0], prevKeyPointLocation[1] - currKeyPointLocation[1]) if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and \ (np.abs(spatialDistance[1]) < maxSpatialDistance[1]): spatialDistances.append(spatialDistance) matches.append(m) meanSpatialDistances = np.mean(spatialDistances, 0) stdSpatialDistances = np.std(spatialDistances, 0) inliers = (spatialDistances - meanSpatialDistances ) < 2.5 * stdSpatialDistances goodMatches = [] prevPoints = [] currPoints = [] for i in range(len(matches)): if inliers[i, 0] and inliers[i, 1]: goodMatches.append(matches[i]) prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt) currPoints.append(keypoints[matches[i].trainIdx].pt) prevPoints = np.array(prevPoints) currPoints = np.array(currPoints) # Draw the keypoint matches on the output image if 0: matches_img = np.hstack((self.prevFrame, frame)) matches_img = cv2.cvtColor(matches_img, cv2.COLOR_GRAY2BGR) W = np.size(self.prevFrame, 1) for m in goodMatches: prev_pt = np.array( self.prevKeyPoints[m.queryIdx].pt, dtype=np.int_) curr_pt = np.array(keypoints[m.trainIdx].pt, dtype=np.int_) curr_pt[0] += W color = np.random.randint(0, 255, (3, )) color = (int(color[0]), int(color[1]), int(color[2])) matches_img = cv2.line(matches_img, prev_pt, curr_pt, tuple(color), 1, cv2.LINE_AA) matches_img = cv2.circle(matches_img, prev_pt, 2, tuple(color), -1) matches_img = cv2.circle(matches_img, curr_pt, 2, tuple(color), -1) plt.figure() plt.imshow(matches_img) plt.show() # Find rigid matrix if (np.size(prevPoints, 0) > 4) and ( np.size(prevPoints, 0) == np.size(currPoints, 0)): H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC) # Handle downscale if self.downscale > 1.0: H[0, 2] *= self.downscale H[1, 2] *= self.downscale else: print('Warning: not enough matching points') # Store to next iteration self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) self.prevDescriptors = copy.copy(descriptors) return H def applySparseOptFlow(self, raw_frame, detections=None): t0 = time.time() # Initialize height, width, _ = raw_frame.shape frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) H = np.eye(2, 3) # Downscale image if self.downscale > 1.0: # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) frame = cv2.resize(frame, (width // self.downscale, height // self.downscale)) # find the keypoints keypoints = cv2.goodFeaturesToTrack( frame, mask=None, **self.feature_params) # Handle first frame if not self.initializedFirstFrame: # Initialize data self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) # Initialization done self.initializedFirstFrame = True return H if self.prevFrame.shape != frame.shape: self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) return H # find correspondences matchedKeypoints, status, err = cv2.calcOpticalFlowPyrLK( self.prevFrame, frame, self.prevKeyPoints, None) # leave good
correspondences only prevPoints = [] currPoints = [] for i in range(len(status)): if status[i]: prevPoints.append(self.prevKeyPoints[i]) currPoints.append(matchedKeypoints[i]) prevPoints = np.array(prevPoints) currPoints = np.array(currPoints) # Find rigid matrix if (np.size(prevPoints, 0) > 4) and ( np.size(prevPoints, 0) == np.size(currPoints, 0)): H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC) # Handle downscale if self.downscale > 1.0: H[0, 2] *= self.downscale H[1, 2] *= self.downscale else: print('Warning: not enough matching points') # Store to next iteration self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) t1 = time.time() # gmc_line = str(1000 * (t1 - t0)) + "\t" + str(H[0, 0]) + "\t" + str(H[0, 1]) + "\t" + str( # H[0, 2]) + "\t" + str(H[1, 0]) + "\t" + str(H[1, 1]) + "\t" + str(H[1, 2]) + "\n" # self.gmc_file.write(gmc_line) return H def applyFile(self, raw_frame, detections=None): line = self.gmcFile.readline() tokens = line.split("\t") H = np.eye(2, 3, dtype=np.float_) H[0, 0] = float(tokens[1]) H[0, 1] = float(tokens[2]) H[0, 2] = float(tokens[3]) H[1, 0] = float(tokens[4]) H[1, 1] = float(tokens[5]) H[1, 2] = float(tokens[6]) return H ================================================ FILE: ppdet/modeling/mot/motion/kalman_filter.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py """ import numpy as np import scipy.linalg use_numba = True try: import numba as nb @nb.njit(fastmath=True, cache=True) def nb_project(mean, covariance, std, _update_mat): innovation_cov = np.diag(np.square(std)) mean = np.dot(_update_mat, mean) covariance = np.dot(np.dot(_update_mat, covariance), _update_mat.T) return mean, covariance + innovation_cov @nb.njit(fastmath=True, cache=True) def nb_multi_predict(mean, covariance, motion_cov, motion_mat): mean = np.dot(mean, motion_mat.T) left = np.dot(motion_mat, covariance) covariance = np.dot(left, motion_mat.T) + motion_cov return mean, covariance @nb.njit(fastmath=True, cache=True) def nb_update(mean, covariance, proj_mean, proj_cov, measurement, meas_mat): kalman_gain = np.linalg.solve(proj_cov, (covariance @meas_mat.T).T).T innovation = measurement - proj_mean mean = mean + innovation @kalman_gain.T covariance = covariance - kalman_gain @proj_cov @kalman_gain.T return mean, covariance except: use_numba = False print( 'Warning: Unable to use numba in PP-Tracking, please install numba, for example(python3.7): `pip install numba==0.56.4`' ) pass __all__ = ['KalmanFilter'] """ Table for the 0.95 quantile of the chi-square distribution with N degrees of freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv function and used as Mahalanobis gating threshold.
""" chi2inv95 = { 1: 3.8415, 2: 5.9915, 3: 7.8147, 4: 9.4877, 5: 11.070, 6: 12.592, 7: 14.067, 8: 15.507, 9: 16.919 } class KalmanFilter(object): """ A simple Kalman filter for tracking bounding boxes in image space. The 8-dimensional state space x, y, a, h, vx, vy, va, vh contains the bounding box center position (x, y), aspect ratio a, height h, and their respective velocities. Object motion follows a constant velocity model. The bounding box location (x, y, a, h) is taken as direct observation of the state space (linear observation model). """ def __init__(self): ndim, dt = 4, 1. # Create Kalman filter model matrices. self._motion_mat = np.eye(2 * ndim, 2 * ndim, dtype=np.float32) for i in range(ndim): self._motion_mat[i, ndim + i] = dt self._update_mat = np.eye(ndim, 2 * ndim, dtype=np.float32) # Motion and observation uncertainty are chosen relative to the current # state estimate. These weights control the amount of uncertainty in # the model. This is a bit hacky. self._std_weight_position = 1. / 20 self._std_weight_velocity = 1. / 160 def initiate(self, measurement): """ Create track from unassociated measurement. Args: measurement (ndarray): Bounding box coordinates (x, y, a, h) with center position (x, y), aspect ratio a, and height h. Returns: The mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track. Unobserved velocities are initialized to 0 mean. """ mean_pos = measurement mean_vel = np.zeros_like(mean_pos) mean = np.r_[mean_pos, mean_vel] std = [ 2 * self._std_weight_position * measurement[3], 2 * self._std_weight_position * measurement[3], 1e-2, 2 * self._std_weight_position * measurement[3], 10 * self._std_weight_velocity * measurement[3], 10 * self._std_weight_velocity * measurement[3], 1e-5, 10 * self._std_weight_velocity * measurement[3] ] covariance = np.diag(np.square(std)) return mean, np.float32(covariance) def predict(self, mean, covariance): """ Run Kalman filter prediction step. Args: mean (ndarray): The 8 dimensional mean vector of the object state at the previous time step. covariance (ndarray): The 8x8 dimensional covariance matrix of the object state at the previous time step. Returns: The mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. """ std_pos = [ self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-2, self._std_weight_position * mean[3] ] std_vel = [ self._std_weight_velocity * mean[3], self._std_weight_velocity * mean[3], 1e-5, self._std_weight_velocity * mean[3] ] motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) #mean = np.dot(self._motion_mat, mean) mean = np.dot(mean, self._motion_mat.T) covariance = np.linalg.multi_dot( (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov return mean, covariance def project(self, mean, covariance): """ Project state distribution to measurement space. Args mean (ndarray): The state's mean vector (8 dimensional array). covariance (ndarray): The state's covariance matrix (8x8 dimensional). Returns: The projected mean and covariance matrix of the given state estimate. 
""" std = np.array( [ self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-1, self._std_weight_position * mean[3] ], dtype=np.float32) if use_numba: return nb_project(mean, covariance, std, self._update_mat) innovation_cov = np.diag(np.square(std)) mean = np.dot(self._update_mat, mean) covariance = np.linalg.multi_dot((self._update_mat, covariance, self._update_mat.T)) return mean, covariance + innovation_cov def multi_predict(self, mean, covariance): """ Run Kalman filter prediction step (Vectorized version). Args: mean (ndarray): The Nx8 dimensional mean matrix of the object states at the previous time step. covariance (ndarray): The Nx8x8 dimensional covariance matrics of the object states at the previous time step. Returns: The mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. """ std_pos = np.array([ self._std_weight_position * mean[:, 3], self._std_weight_position * mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]), self._std_weight_position * mean[:, 3] ]) std_vel = np.array([ self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]), self._std_weight_velocity * mean[:, 3] ]) sqr = np.square(np.r_[std_pos, std_vel]).T if use_numba: means = [] covariances = [] for i in range(len(mean)): a, b = nb_multi_predict(mean[i], covariance[i], np.diag(sqr[i]), self._motion_mat) means.append(a) covariances.append(b) return np.asarray(means), np.asarray(covariances) motion_cov = [] for i in range(len(mean)): motion_cov.append(np.diag(sqr[i])) motion_cov = np.asarray(motion_cov) mean = np.dot(mean, self._motion_mat.T) left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) covariance = np.dot(left, self._motion_mat.T) + motion_cov return mean, covariance def update(self, mean, covariance, measurement): """ Run Kalman filter correction step. Args: mean (ndarray): The predicted state's mean vector (8 dimensional). covariance (ndarray): The state's covariance matrix (8x8 dimensional). measurement (ndarray): The 4 dimensional measurement vector (x, y, a, h), where (x, y) is the center position, a the aspect ratio, and h the height of the bounding box. Returns: The measurement-corrected state distribution. """ projected_mean, projected_cov = self.project(mean, covariance) if use_numba: return nb_update(mean, covariance, projected_mean, projected_cov, measurement, self._update_mat) kalman_gain = np.linalg.solve(projected_cov, (covariance @self._update_mat.T).T).T innovation = measurement - projected_mean mean = mean + innovation @kalman_gain.T covariance = covariance - kalman_gain @projected_cov @kalman_gain.T return mean, covariance def gating_distance(self, mean, covariance, measurements, only_position=False, metric='maha'): """ Compute gating distance between state distribution and measurements. A suitable distance threshold can be obtained from `chi2inv95`. If `only_position` is False, the chi-square distribution has 4 degrees of freedom, otherwise 2. Args: mean (ndarray): Mean vector over the state distribution (8 dimensional). covariance (ndarray): Covariance of the state distribution (8x8 dimensional). measurements (ndarray): An Nx4 dimensional matrix of N measurements, each in format (x, y, a, h) where (x, y) is the bounding box center position, a the aspect ratio, and h the height. only_position (Optional[bool]): If True, distance computation is done with respect to the bounding box center position only. metric (str): Metric type, 'gaussian' or 'maha'. 
Returns: An array of length N, where the i-th element contains the squared Mahalanobis distance between (mean, covariance) and `measurements[i]`. """ mean, covariance = self.project(mean, covariance) if only_position: mean, covariance = mean[:2], covariance[:2, :2] measurements = measurements[:, :2] d = measurements - mean if metric == 'gaussian': return np.sum(d * d, axis=1) elif metric == 'maha': cholesky_factor = np.linalg.cholesky(covariance) z = scipy.linalg.solve_triangular( cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True) squared_maha = np.sum(z * z, axis=0) return squared_maha else: raise ValueError('invalid distance metric') ================================================ FILE: ppdet/modeling/mot/motion/ocsort_kalman_filter.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/danbochman/SORT/blob/danny_opencv/kalman_filter.py """ import numpy as np from numpy import dot, zeros, eye from numpy.linalg import inv use_numba = True try: import numba as nb @nb.njit(fastmath=True, cache=True) def nb_predict(x, F, P, Q): x = dot(F, x) P = dot(dot(F, P), F.T) + Q return x, P @nb.njit(fastmath=True, cache=True) def nb_update(x, z, H, P, R, _I): y = z - np.dot(H, x) PHT = dot(P, H.T) S = dot(H, PHT) + R K = dot(PHT, inv(S)) x = x + dot(K, y) I_KH = _I - dot(K, H) P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T) return x, P except: use_numba = False print( 'Warning: Unable to use numba in PP-Tracking, please install numba, for example(python3.7): `pip install numba==0.56.4`' ) pass class OCSORTKalmanFilter: def __init__(self, dim_x, dim_z): self.dim_x = dim_x self.dim_z = dim_z self.x = zeros((dim_x, 1)) self.P = eye(dim_x) self.Q = eye(dim_x) self.F = eye(dim_x) self.H = zeros((dim_z, dim_x)) self.R = eye(dim_z) self.M = zeros((dim_z, dim_z)) self._I = eye(dim_x) def predict(self): if use_numba: self.x, self.P = nb_predict(self.x, self.F, self.P, self.Q) else: self.x = dot(self.F, self.x) self.P = dot(dot(self.F, self.P), self.F.T) + self.Q def update(self, z): if z is None: return if use_numba: self.x, self.P = nb_update(self.x, z, self.H, self.P, self.R, self._I) else: y = z - np.dot(self.H, self.x) PHT = dot(self.P, self.H.T) S = dot(self.H, PHT) + self.R K = dot(PHT, inv(S)) self.x = self.x + dot(K, y) I_KH = self._I - dot(K, self.H) self.P = dot(dot(I_KH, self.P), I_KH.T) + dot(dot(K, self.R), K.T) ================================================ FILE: ppdet/modeling/mot/tracker/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
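To make the filter API above concrete, a minimal hedged usage sketch of one predict/update cycle (import path assumed from this repository's layout; the box numbers are invented):

```python
import numpy as np
from ppdet.modeling.mot.motion.kalman_filter import KalmanFilter, chi2inv95

kf = KalmanFilter()
# A detection in (center x, center y, aspect ratio, height) form.
mean, cov = kf.initiate(np.array([50., 40., 0.5, 80.], dtype=np.float32))
mean, cov = kf.predict(mean, cov)  # constant-velocity prior for the next frame
mean, cov = kf.update(mean, cov,
                      np.array([52., 41., 0.5, 81.], dtype=np.float32))

# Gate a candidate measurement with the 95% chi-square threshold (4 DoF).
d2 = kf.gating_distance(mean, cov, np.array([[53., 42., 0.5, 80.]]))
print(d2 < chi2inv95[4])  # True when the candidate lies inside the gate
```

The numba-accelerated `nb_*` helpers are used transparently when numba is installed; the pure-NumPy fallback gives the same results.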
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import base_jde_tracker from . import base_sde_tracker from .base_jde_tracker import * from .base_sde_tracker import * from . import jde_tracker from . import deepsort_tracker from . import ocsort_tracker from . import center_tracker from .jde_tracker import * from .deepsort_tracker import * from .ocsort_tracker import * from .botsort_tracker import * from .center_tracker import * ================================================ FILE: ppdet/modeling/mot/tracker/base_jde_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py """ import numpy as np from collections import defaultdict from collections import deque, OrderedDict from ..matching import jde_matching as matching from ppdet.core.workspace import register, serializable import warnings warnings.filterwarnings("ignore") __all__ = [ 'TrackState', 'BaseTrack', 'STrack', 'joint_stracks', 'sub_stracks', 'remove_duplicate_stracks', ] class TrackState(object): New = 0 Tracked = 1 Lost = 2 Removed = 3 @register @serializable class BaseTrack(object): _count_dict = defaultdict(int) # support single class and multi classes track_id = 0 is_activated = False state = TrackState.New history = OrderedDict() features = [] curr_feat = None score = 0 start_frame = 0 frame_id = 0 time_since_update = 0 # multi-camera location = (np.inf, np.inf) @property def end_frame(self): return self.frame_id @staticmethod def next_id(cls_id): BaseTrack._count_dict[cls_id] += 1 return BaseTrack._count_dict[cls_id] # @even: reset track id @staticmethod def init_count(num_classes): """ Initiate _count for all object classes :param num_classes: """ for cls_id in range(num_classes): BaseTrack._count_dict[cls_id] = 0 @staticmethod def reset_track_count(cls_id): BaseTrack._count_dict[cls_id] = 0 def activate(self, *args): raise NotImplementedError def predict(self): raise NotImplementedError def update(self, *args, **kwargs): raise NotImplementedError def mark_lost(self): self.state = TrackState.Lost def mark_removed(self): self.state = TrackState.Removed @register @serializable class STrack(BaseTrack): def __init__(self, tlwh, score, cls_id, buff_size=30, temp_feat=None): # wait activate self._tlwh = np.asarray(tlwh, dtype=np.float32) self.score = score self.cls_id = cls_id self.track_len = 0 self.kalman_filter = None self.mean, self.covariance = None, None self.is_activated = False self.use_reid = True if temp_feat is not None else False if self.use_reid: 
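# --- Added note (commentary, not in the original Towards-Realtime-MOT code):
# when ReID features are used, `update_features` below maintains an
# exponential moving average of the embedding,
#     smooth_feat = 0.9 * smooth_feat + 0.1 * feat,
# re-normalized to unit L2 norm, so a track's appearance model drifts slowly
# instead of jumping to each new detection's feature.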
self.smooth_feat = None self.update_features(temp_feat) self.features = deque([], maxlen=buff_size) self.alpha = 0.9 def update_features(self, feat): # L2 normalizing, this function has no use for BYTETracker feat /= np.linalg.norm(feat) self.curr_feat = feat if self.smooth_feat is None: self.smooth_feat = feat else: self.smooth_feat = self.alpha * self.smooth_feat + (1.0 - self.alpha ) * feat self.features.append(feat) self.smooth_feat /= np.linalg.norm(self.smooth_feat) def predict(self): mean_state = self.mean.copy() if self.state != TrackState.Tracked: mean_state[7] = 0 self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) @staticmethod def multi_predict(tracks, kalman_filter): if len(tracks) > 0: multi_mean = np.asarray([track.mean.copy() for track in tracks]) multi_covariance = np.asarray( [track.covariance for track in tracks]) for i, st in enumerate(tracks): if st.state != TrackState.Tracked: multi_mean[i][7] = 0 multi_mean, multi_covariance = kalman_filter.multi_predict( multi_mean, multi_covariance) for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): tracks[i].mean = mean tracks[i].covariance = cov @staticmethod def multi_gmc(stracks, H=np.eye(2, 3)): if len(stracks) > 0: multi_mean = np.asarray([st.mean.copy() for st in stracks]) multi_covariance = np.asarray([st.covariance for st in stracks]) R = H[:2, :2] R8x8 = np.kron(np.eye(4, dtype=float), R) t = H[:2, 2] for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): mean = R8x8.dot(mean) mean[:2] += t cov = R8x8.dot(cov).dot(R8x8.transpose()) stracks[i].mean = mean stracks[i].covariance = cov def reset_track_id(self): self.reset_track_count(self.cls_id) def activate(self, kalman_filter, frame_id): """Start a new track""" self.kalman_filter = kalman_filter # update track id for the object class self.track_id = self.next_id(self.cls_id) self.mean, self.covariance = self.kalman_filter.initiate( self.tlwh_to_xyah(self._tlwh)) self.track_len = 0 self.state = TrackState.Tracked # set flag 'tracked' if frame_id == 1: # to record the first frame's detection result self.is_activated = True self.frame_id = frame_id self.start_frame = frame_id def re_activate(self, new_track, frame_id, new_id=False): self.mean, self.covariance = self.kalman_filter.update( self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh)) if self.use_reid: self.update_features(new_track.curr_feat) self.track_len = 0 self.state = TrackState.Tracked self.is_activated = True self.frame_id = frame_id if new_id: # update track id for the object class self.track_id = self.next_id(self.cls_id) def update(self, new_track, frame_id, update_feature=True): self.frame_id = frame_id self.track_len += 1 new_tlwh = new_track.tlwh self.mean, self.covariance = self.kalman_filter.update( self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) self.state = TrackState.Tracked # set flag 'tracked' self.is_activated = True # set flag 'activated' self.score = new_track.score if update_feature and self.use_reid: self.update_features(new_track.curr_feat) @property def tlwh(self): """Get current position in bounding box format `(top left x, top left y, width, height)`. """ if self.mean is None: return self._tlwh.copy() ret = self.mean[:4].copy() ret[2] *= ret[3] ret[:2] -= ret[2:] / 2 return ret @property def tlbr(self): """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., `(top left, bottom right)`. 
""" ret = self.tlwh.copy() ret[2:] += ret[:2] return ret @staticmethod def tlwh_to_xyah(tlwh): """Convert bounding box to format `(center x, center y, aspect ratio, height)`, where the aspect ratio is `width / height`. """ ret = np.asarray(tlwh).copy() ret[:2] += ret[2:] / 2 ret[2] /= ret[3] return ret def to_xyah(self): return self.tlwh_to_xyah(self.tlwh) @staticmethod def tlbr_to_tlwh(tlbr): ret = np.asarray(tlbr).copy() ret[2:] -= ret[:2] return ret @staticmethod def tlwh_to_tlbr(tlwh): ret = np.asarray(tlwh).copy() ret[2:] += ret[:2] return ret def __repr__(self): return 'OT_({}-{})_({}-{})'.format(self.cls_id, self.track_id, self.start_frame, self.end_frame) def joint_stracks(tlista, tlistb): exists = {} res = [] for t in tlista: exists[t.track_id] = 1 res.append(t) for t in tlistb: tid = t.track_id if not exists.get(tid, 0): exists[tid] = 1 res.append(t) return res def sub_stracks(tlista, tlistb): stracks = {} for t in tlista: stracks[t.track_id] = t for t in tlistb: tid = t.track_id if stracks.get(tid, 0): del stracks[tid] return list(stracks.values()) def remove_duplicate_stracks(stracksa, stracksb): pdist = matching.iou_distance(stracksa, stracksb) pairs = np.where(pdist < 0.15) dupa, dupb = list(), list() for p, q in zip(*pairs): timep = stracksa[p].frame_id - stracksa[p].start_frame timeq = stracksb[q].frame_id - stracksb[q].start_frame if timep > timeq: dupb.append(q) else: dupa.append(p) resa = [t for i, t in enumerate(stracksa) if not i in dupa] resb = [t for i, t in enumerate(stracksb) if not i in dupb] return resa, resb ================================================ FILE: ppdet/modeling/mot/tracker/base_sde_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py """ import datetime from ppdet.core.workspace import register, serializable __all__ = ['TrackState', 'Track'] class TrackState(object): """ Enumeration type for the single target track state. Newly created tracks are classified as `tentative` until enough evidence has been collected. Then, the track state is changed to `confirmed`. Tracks that are no longer alive are classified as `deleted` to mark them for removal from the set of active tracks. """ Tentative = 1 Confirmed = 2 Deleted = 3 @register @serializable class Track(object): """ A single target track with state space `(x, y, a, h)` and associated velocities, where `(x, y)` is the center of the bounding box, `a` is the aspect ratio and `h` is the height. Args: mean (ndarray): Mean vector of the initial state distribution. covariance (ndarray): Covariance matrix of the initial state distribution. track_id (int): A unique track identifier. n_init (int): Number of consecutive detections before the track is confirmed. The track state is set to `Deleted` if a miss occurs within the first `n_init` frames. 
max_age (int): The maximum number of consecutive misses before the track state is set to `Deleted`. cls_id (int): The category id of the tracked box. score (float): The confidence score of the tracked box. feature (Optional[ndarray]): Feature vector of the detection this track originates from. If not None, this feature is added to the `features` cache. Attributes: hits (int): Total number of measurement updates. age (int): Total number of frames since first occurrence. time_since_update (int): Total number of frames since last measurement update. state (TrackState): The current track state. features (List[ndarray]): A cache of features. On each measurement update, the associated feature vector is added to this list. """ def __init__(self, mean, covariance, track_id, n_init, max_age, cls_id, score, feature=None): self.mean = mean self.covariance = covariance self.track_id = track_id self.hits = 1 self.age = 1 self.time_since_update = 0 self.cls_id = cls_id self.score = score self.start_time = datetime.datetime.now() self.state = TrackState.Tentative self.features = [] self.feat = feature if feature is not None: self.features.append(feature) self._n_init = n_init self._max_age = max_age def to_tlwh(self): """Get position in format `(top left x, top left y, width, height)`.""" ret = self.mean[:4].copy() ret[2] *= ret[3] ret[:2] -= ret[2:] / 2 return ret def to_tlbr(self): """Get position in bounding box format `(min x, min y, max x, max y)`.""" ret = self.to_tlwh() ret[2:] = ret[:2] + ret[2:] return ret def predict(self, kalman_filter): """ Propagate the state distribution to the current time step using a Kalman filter prediction step. """ self.mean, self.covariance = kalman_filter.predict(self.mean, self.covariance) self.age += 1 self.time_since_update += 1 def update(self, kalman_filter, detection): """ Perform Kalman filter measurement update step and update the associated detection feature cache. """ self.mean, self.covariance = kalman_filter.update(self.mean, self.covariance, detection.to_xyah()) self.features.append(detection.feature) self.feat = detection.feature self.cls_id = detection.cls_id self.score = detection.score self.hits += 1 self.time_since_update = 0 if self.state == TrackState.Tentative and self.hits >= self._n_init: self.state = TrackState.Confirmed def mark_missed(self): """Mark this track as missed (no association at the current time step). """ if self.state == TrackState.Tentative: self.state = TrackState.Deleted elif self.time_since_update > self._max_age: self.state = TrackState.Deleted def is_tentative(self): """Returns True if this track is tentative (unconfirmed).""" return self.state == TrackState.Tentative def is_confirmed(self): """Returns True if this track is confirmed.""" return self.state == TrackState.Confirmed def is_deleted(self): """Returns True if this track is dead and should be deleted.""" return self.state == TrackState.Deleted ================================================ FILE: ppdet/modeling/mot/tracker/botsort_tracker.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/bot_sort.py """ import cv2 import numpy as np from collections import deque from ..matching import jde_matching as matching from ..motion import GMC from .base_jde_tracker import TrackState, STrack from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks from ..motion import KalmanFilter from ppdet.core.workspace import register, serializable @register @serializable class BOTSORTTracker(object): """ BOTSORT tracker, support single class Args: track_high_thresh (float): score threshold for the first (high score) association round track_low_thresh (float): detections below this score are discarded new_track_thresh (float): minimum score required to start a new track match_thresh (float): IoU threshold for association track_buffer (int): number of frames a lost track is kept alive, default 30 min_box_area (float): minimum box area to keep camera_motion (bool): whether to apply camera motion compensation, default False cmc_method (str): camera motion compensation method, default 'sparseOptFlow' frame_rate (int): fps, used as buffer_size = int(frame_rate / 30.0 * track_buffer) """ def __init__(self, track_high_thresh=0.3, track_low_thresh=0.2, new_track_thresh=0.4, match_thresh=0.7, track_buffer=30, min_box_area=0, camera_motion=False, cmc_method='sparseOptFlow', frame_rate=30): self.tracked_stracks = [] # type: list[STrack] self.lost_stracks = [] # type: list[STrack] self.removed_stracks = [] # type: list[STrack] self.frame_id = 0 self.track_high_thresh = track_high_thresh self.track_low_thresh = track_low_thresh self.new_track_thresh = new_track_thresh self.match_thresh = match_thresh self.buffer_size = int(frame_rate / 30.0 * track_buffer) self.max_time_lost = self.buffer_size self.kalman_filter = KalmanFilter() self.min_box_area = min_box_area self.camera_motion = camera_motion self.gmc = GMC(method=cmc_method) def update(self, output_results, img=None): self.frame_id += 1 activated_starcks = [] refind_stracks = [] lost_stracks = [] removed_stracks = [] if len(output_results): bboxes = output_results[:, 2:6] scores = output_results[:, 1] classes = output_results[:, 0] # Remove bad detections lowest_inds = scores > self.track_low_thresh bboxes = bboxes[lowest_inds] scores = scores[lowest_inds] classes = classes[lowest_inds] # Find high threshold detections remain_inds = scores > self.track_high_thresh dets = bboxes[remain_inds] scores_keep = scores[remain_inds] classes_keep = classes[remain_inds] else: bboxes = [] scores = [] classes = [] dets = [] scores_keep = [] classes_keep = [] if len(dets) > 0: '''Detections''' detections = [ STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in zip(dets, scores_keep, classes_keep) ] else: detections = [] ''' Add newly detected tracklets to tracked_stracks''' unconfirmed = [] tracked_stracks = [] # type: list[STrack] for track in self.tracked_stracks: if not track.is_activated: unconfirmed.append(track) else: tracked_stracks.append(track) ''' Step 2: First association, with high score detection boxes''' strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) # Predict the current location with KF
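# The Kalman state used by STrack is the 8-dim vector
# [cx, cy, a, h, vcx, vcy, va, vh]; multi_predict zeroes the height velocity
# (index 7) of tracks that are not currently Tracked before propagating them
# one step. When camera_motion is enabled, multi_gmc (base_jde_tracker.py)
# then warps mean and covariance with the 2x2 block R of the estimated 2x3
# affine H via R8x8 = np.kron(np.eye(4), R); e.g. a pure translation
# H = [[1, 0, 5], [0, 1, -3]] leaves R8x8 = I and just shifts (cx, cy) by (5, -3).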
STrack.multi_predict(strack_pool, self.kalman_filter) # Fix camera motion if self.camera_motion: warp = self.gmc.apply(img[0], dets) STrack.multi_gmc(strack_pool, warp) STrack.multi_gmc(unconfirmed, warp) # Associate with high score detection boxes ious_dists = matching.iou_distance(strack_pool, detections) matches, u_track, u_detection = matching.linear_assignment( ious_dists, thresh=self.match_thresh) for itracked, idet in matches: track = strack_pool[itracked] det = detections[idet] if track.state == TrackState.Tracked: track.update(detections[idet], self.frame_id) activated_starcks.append(track) else: track.re_activate(det, self.frame_id, new_id=False) refind_stracks.append(track) ''' Step 3: Second association, with low score detection boxes''' if len(scores): inds_high = scores < self.track_high_thresh inds_low = scores > self.track_low_thresh inds_second = np.logical_and(inds_low, inds_high) dets_second = bboxes[inds_second] scores_second = scores[inds_second] classes_second = classes[inds_second] else: dets_second = [] scores_second = [] classes_second = [] # association the untrack to the low score detections if len(dets_second) > 0: '''Detections''' detections_second = [ STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in zip(dets_second, scores_second, classes_second) ] else: detections_second = [] r_tracked_stracks = [ strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked ] dists = matching.iou_distance(r_tracked_stracks, detections_second) matches, u_track, u_detection_second = matching.linear_assignment( dists, thresh=0.5) for itracked, idet in matches: track = r_tracked_stracks[itracked] det = detections_second[idet] if track.state == TrackState.Tracked: track.update(det, self.frame_id) activated_starcks.append(track) else: track.re_activate(det, self.frame_id, new_id=False) refind_stracks.append(track) for it in u_track: track = r_tracked_stracks[it] if not track.state == TrackState.Lost: track.mark_lost() lost_stracks.append(track) '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' detections = [detections[i] for i in u_detection] dists = matching.iou_distance(unconfirmed, detections) matches, u_unconfirmed, u_detection = matching.linear_assignment( dists, thresh=0.7) for itracked, idet in matches: unconfirmed[itracked].update(detections[idet], self.frame_id) activated_starcks.append(unconfirmed[itracked]) for it in u_unconfirmed: track = unconfirmed[it] track.mark_removed() removed_stracks.append(track) """ Step 4: Init new stracks""" for inew in u_detection: track = detections[inew] if track.score < self.new_track_thresh: continue track.activate(self.kalman_filter, self.frame_id) activated_starcks.append(track) """ Step 5: Update state""" for track in self.lost_stracks: if self.frame_id - track.end_frame > self.max_time_lost: track.mark_removed() removed_stracks.append(track) """ Merge """ self.tracked_stracks = [ t for t in self.tracked_stracks if t.state == TrackState.Tracked ] self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) self.lost_stracks.extend(lost_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) self.removed_stracks.extend(removed_stracks) self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( self.tracked_stracks, self.lost_stracks) # output_stracks = [track for track in 
self.tracked_stracks if track.is_activated] output_stracks = [track for track in self.tracked_stracks] return output_stracks ================================================ FILE: ppdet/modeling/mot/tracker/center_tracker.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/xingyizhou/CenterTrack/blob/master/src/lib/utils/tracker.py """ import copy import numpy as np from scipy.optimize import linear_sum_assignment from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['CenterTracker'] @register @serializable class CenterTracker(object): __shared__ = ['num_classes'] def __init__(self, num_classes=1, min_box_area=0, vertical_ratio=-1, track_thresh=0.4, pre_thresh=0.5, new_thresh=0.4, out_thresh=0.4, hungarian=False): self.num_classes = num_classes self.min_box_area = min_box_area self.vertical_ratio = vertical_ratio self.track_thresh = track_thresh self.pre_thresh = max(track_thresh, pre_thresh) self.new_thresh = max(track_thresh, new_thresh) self.out_thresh = max(track_thresh, out_thresh) self.hungarian = hungarian self.reset() def init_track(self, results): logger.info('Initialize tracking!') for item in results: if item['score'] > self.new_thresh: self.id_count += 1 item['tracking_id'] = self.id_count if 'ct' not in item: bbox = item['bbox'] item['ct'] = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2] self.tracks.append(item) def reset(self): self.id_count = 0 self.tracks = [] def update(self, results, public_det=None): N = len(results) M = len(self.tracks) dets = np.array([det['ct'] + det['tracking'] for det in results], np.float32) # N x 2 track_size = np.array([((track['bbox'][2] - track['bbox'][0]) * \ (track['bbox'][3] - track['bbox'][1])) \ for track in self.tracks], np.float32) # M track_cat = np.array([track['class'] for track in self.tracks], np.int32) # M item_size = np.array([((item['bbox'][2] - item['bbox'][0]) * \ (item['bbox'][3] - item['bbox'][1])) \ for item in results], np.float32) # N item_cat = np.array([item['class'] for item in results], np.int32) # N tracks = np.array([pre_det['ct'] for pre_det in self.tracks], np.float32) # M x 2 dist = (((tracks.reshape(1, -1, 2) - \ dets.reshape(-1, 1, 2)) ** 2).sum(axis=2)) # N x M invalid = ((dist > track_size.reshape(1, M)) + \ (dist > item_size.reshape(N, 1)) + \ (item_cat.reshape(N, 1) != track_cat.reshape(1, M))) > 0 dist = dist + invalid * 1e18 if self.hungarian: item_score = np.array([item['score'] for item in results], np.float32) dist[dist > 1e18] = 1e18 # sklearn.utils.linear_assignment_ was removed in scikit-learn 0.23, use scipy's maintained equivalent instead row_ind, col_ind = linear_sum_assignment(dist) matched_indices = np.stack([row_ind, col_ind], axis=1) else: matched_indices = greedy_assignment(copy.deepcopy(dist)) unmatched_dets = [d for d in range(dets.shape[0]) \ if not (d in matched_indices[:, 0])] unmatched_tracks = [d for d in range(tracks.shape[0]) \ if not (d in matched_indices[:, 1])] if self.hungarian: matches = []
for m in matched_indices: if dist[m[0], m[1]] > 1e16: unmatched_dets.append(m[0]) unmatched_tracks.append(m[1]) else: matches.append(m) matches = np.array(matches).reshape(-1, 2) else: matches = matched_indices ret = [] for m in matches: track = results[m[0]] track['tracking_id'] = self.tracks[m[1]]['tracking_id'] ret.append(track) # Private detection: create tracks for all un-matched detections for i in unmatched_dets: track = results[i] if track['score'] > self.new_thresh: self.id_count += 1 track['tracking_id'] = self.id_count ret.append(track) self.tracks = ret return ret def greedy_assignment(dist): matched_indices = [] if dist.shape[1] == 0: return np.array(matched_indices, np.int32).reshape(-1, 2) for i in range(dist.shape[0]): j = dist[i].argmin() if dist[i][j] < 1e16: dist[:, j] = 1e18 matched_indices.append([i, j]) return np.array(matched_indices, np.int32).reshape(-1, 2) ================================================ FILE: ppdet/modeling/mot/tracker/deepsort_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py """ import numpy as np from ..motion import KalmanFilter from ..matching.deepsort_matching import NearestNeighborDistanceMetric from ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix from .base_sde_tracker import Track from ..utils import Detection from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['DeepSORTTracker'] @register @serializable class DeepSORTTracker(object): """ DeepSORT tracker Args: input_size (list): input feature map size to reid model, [h, w] format, [64, 192] as default. min_box_area (int): min box area to filter out low quality boxes vertical_ratio (float): w/h, the vertical ratio of the bbox to filter bad results, set 1.6 default for pedestrian tracking. If set <=0 means no need to filter bboxes. budget (int): If not None, fix samples per class to at most this number. Removes the oldest samples when the budget is reached. max_age (int): maximum number of consecutive misses before a track is deleted n_init (int): Number of frames that a track remains in initialization phase. Number of consecutive detections before the track is confirmed. The track state is set to `Deleted` if a miss occurs within the first `n_init` frames. metric_type (str): either "euclidean" or "cosine", the distance metric used for measurement to track association. matching_threshold (float): samples with larger distance are considered an invalid match.
max_iou_distance (float): max iou distance threshold motion (object): KalmanFilter instance """ def __init__(self, input_size=[64, 192], min_box_area=0, vertical_ratio=-1, budget=100, max_age=70, n_init=3, metric_type='cosine', matching_threshold=0.2, max_iou_distance=0.9, motion='KalmanFilter'): self.input_size = input_size self.min_box_area = min_box_area self.vertical_ratio = vertical_ratio self.max_age = max_age self.n_init = n_init self.metric = NearestNeighborDistanceMetric(metric_type, matching_threshold, budget) self.max_iou_distance = max_iou_distance if motion == 'KalmanFilter': self.motion = KalmanFilter() self.tracks = [] self._next_id = 1 def predict(self): """ Propagate track state distributions one time step forward. This function should be called once every time step, before `update`. """ for track in self.tracks: track.predict(self.motion) def update(self, pred_dets, pred_embs): """ Perform measurement update and track management. Args: pred_dets (np.array): Detection results of the image, the shape is [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128], usually pred_embs.shape[1] is a multiple of 128. """ pred_cls_ids = pred_dets[:, 0:1] pred_scores = pred_dets[:, 1:2] pred_xyxys = pred_dets[:, 2:6] pred_tlwhs = np.concatenate((pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), axis=1) detections = [ Detection(tlwh, score, feat, cls_id) for tlwh, score, feat, cls_id in zip(pred_tlwhs, pred_scores, pred_embs, pred_cls_ids) ] # Run matching cascade. matches, unmatched_tracks, unmatched_detections = \ self._match(detections) # Update track set. for track_idx, detection_idx in matches: self.tracks[track_idx].update(self.motion, detections[detection_idx]) for track_idx in unmatched_tracks: self.tracks[track_idx].mark_missed() for detection_idx in unmatched_detections: self._initiate_track(detections[detection_idx]) self.tracks = [t for t in self.tracks if not t.is_deleted()] # Update distance metric. active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] features, targets = [], [] for track in self.tracks: if not track.is_confirmed(): continue features += track.features targets += [track.track_id for _ in track.features] track.features = [] self.metric.partial_fit( np.asarray(features), np.asarray(targets), active_targets) output_stracks = self.tracks return output_stracks def _match(self, detections): def gated_metric(tracks, dets, track_indices, detection_indices): features = np.array([dets[i].feature for i in detection_indices]) targets = np.array([tracks[i].track_id for i in track_indices]) cost_matrix = self.metric.distance(features, targets) cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks, dets, track_indices, detection_indices) return cost_matrix # Split track set into confirmed and unconfirmed tracks. confirmed_tracks = [ i for i, t in enumerate(self.tracks) if t.is_confirmed() ] unconfirmed_tracks = [ i for i, t in enumerate(self.tracks) if not t.is_confirmed() ] # Associate confirmed tracks using appearance features. matches_a, unmatched_tracks_a, unmatched_detections = \ matching_cascade( gated_metric, self.metric.matching_threshold, self.max_age, self.tracks, detections, confirmed_tracks) # Associate remaining tracks together with unconfirmed tracks using IOU. 
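# This is the standard two-stage DeepSORT association: confirmed tracks are
# matched first by appearance in matching_cascade (gated by the Mahalanobis
# distance through gate_cost_matrix), and only tracks missed for exactly one
# frame, together with the still-unconfirmed tracks, fall through to the
# cheaper IoU matching below; tracks lost for longer than one frame simply
# wait for the next appearance pass.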
iou_track_candidates = unconfirmed_tracks + [ k for k in unmatched_tracks_a if self.tracks[k].time_since_update == 1 ] unmatched_tracks_a = [ k for k in unmatched_tracks_a if self.tracks[k].time_since_update != 1 ] matches_b, unmatched_tracks_b, unmatched_detections = \ min_cost_matching( iou_cost, self.max_iou_distance, self.tracks, detections, iou_track_candidates, unmatched_detections) matches = matches_a + matches_b unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) return matches, unmatched_tracks, unmatched_detections def _initiate_track(self, detection): mean, covariance = self.motion.initiate(detection.to_xyah()) self.tracks.append( Track(mean, covariance, self._next_id, self.n_init, self.max_age, detection.cls_id, detection.score, detection.feature)) self._next_id += 1 ================================================ FILE: ppdet/modeling/mot/tracker/jde_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py """ import numpy as np from collections import defaultdict from ..matching import jde_matching as matching from ..motion import KalmanFilter from .base_jde_tracker import TrackState, STrack from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['JDETracker'] @register @serializable class JDETracker(object): __shared__ = ['num_classes'] """ JDE tracker, support single class and multi classes Args: use_byte (bool): Whether use ByteTracker, default False num_classes (int): the number of classes det_thresh (float): threshold of detection score track_buffer (int): buffer for tracker min_box_area (int): min box area to filter out low quality boxes vertical_ratio (float): w/h, the vertical ratio of the bbox to filter bad results. If set <= 0 means no need to filter bboxes,usually set 1.6 for pedestrian tracking. tracked_thresh (float): linear assignment threshold of tracked stracks and detections r_tracked_thresh (float): linear assignment threshold of tracked stracks and unmatched detections unconfirmed_thresh (float): linear assignment threshold of unconfirmed stracks and unmatched detections conf_thres (float): confidence threshold for tracking, also used in ByteTracker as higher confidence threshold match_thres (float): linear assignment threshold of tracked stracks and detections in ByteTracker low_conf_thres (float): lower confidence threshold for tracking in ByteTracker input_size (list): input feature map size to reid model, [h, w] format, [64, 192] as default. motion (str): motion model, KalmanFilter as default metric_type (str): either "euclidean" or "cosine", the distance metric used for measurement to track association. 
""" def __init__(self, use_byte=False, num_classes=1, det_thresh=0.3, track_buffer=30, min_box_area=0, vertical_ratio=0, tracked_thresh=0.7, r_tracked_thresh=0.5, unconfirmed_thresh=0.7, conf_thres=0, match_thres=0.8, low_conf_thres=0.2, input_size=[64, 192], motion='KalmanFilter', metric_type='euclidean'): self.use_byte = use_byte self.num_classes = num_classes self.det_thresh = det_thresh if not use_byte else conf_thres + 0.1 self.track_buffer = track_buffer self.min_box_area = min_box_area self.vertical_ratio = vertical_ratio self.tracked_thresh = tracked_thresh self.r_tracked_thresh = r_tracked_thresh self.unconfirmed_thresh = unconfirmed_thresh self.conf_thres = conf_thres self.match_thres = match_thres self.low_conf_thres = low_conf_thres self.input_size = input_size if motion == 'KalmanFilter': self.motion = KalmanFilter() self.metric_type = metric_type self.frame_id = 0 self.tracked_tracks_dict = defaultdict(list) # dict(list[STrack]) self.lost_tracks_dict = defaultdict(list) # dict(list[STrack]) self.removed_tracks_dict = defaultdict(list) # dict(list[STrack]) self.max_time_lost = 0 # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer) def update(self, pred_dets, pred_embs=None): """ Processes the image frame and finds bounding box(detections). Associates the detection with corresponding tracklets and also handles lost, removed, refound and active tracklets. Args: pred_dets (np.array): Detection results of the image, the shape is [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128] or [N, 512]. Return: output_stracks_dict (dict(list)): The list contains information regarding the online_tracklets for the received image tensor. """ self.frame_id += 1 if self.frame_id == 1: STrack.init_count(self.num_classes) activated_tracks_dict = defaultdict(list) refined_tracks_dict = defaultdict(list) lost_tracks_dict = defaultdict(list) removed_tracks_dict = defaultdict(list) output_tracks_dict = defaultdict(list) pred_dets_dict = defaultdict(list) pred_embs_dict = defaultdict(list) # unify single and multi classes detection and embedding results for cls_id in range(self.num_classes): cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1) pred_dets_dict[cls_id] = pred_dets[cls_idx] if pred_embs is not None: pred_embs_dict[cls_id] = pred_embs[cls_idx] else: pred_embs_dict[cls_id] = None for cls_id in range(self.num_classes): """ Step 1: Get detections by class""" pred_dets_cls = pred_dets_dict[cls_id] pred_embs_cls = pred_embs_dict[cls_id] remain_inds = (pred_dets_cls[:, 1:2] > self.conf_thres).squeeze(-1) if remain_inds.sum() > 0: pred_dets_cls = pred_dets_cls[remain_inds] if pred_embs_cls is None: # in original ByteTrack detections = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat=None) for tlbrs in pred_dets_cls ] else: pred_embs_cls = pred_embs_cls[remain_inds] detections = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat) for (tlbrs, temp_feat) in zip(pred_dets_cls, pred_embs_cls) ] else: detections = [] ''' Add newly detected tracklets to tracked_stracks''' unconfirmed_dict = defaultdict(list) tracked_tracks_dict = defaultdict(list) for track in self.tracked_tracks_dict[cls_id]: if not track.is_activated: # previous tracks which are not active in the current frame are added in unconfirmed list unconfirmed_dict[cls_id].append(track) else: # Active tracks are added to the local list 'tracked_stracks' tracked_tracks_dict[cls_id].append(track) """ Step 2: 
First association, with embedding""" # building tracking pool for the current frame track_pool_dict = defaultdict(list) track_pool_dict[cls_id] = joint_stracks( tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id]) # Predict the current location with KalmanFilter STrack.multi_predict(track_pool_dict[cls_id], self.motion) if pred_embs_cls is None: # in original ByteTrack dists = matching.iou_distance(track_pool_dict[cls_id], detections) matches, u_track, u_detection = matching.linear_assignment( dists, thresh=self.match_thres) # not self.tracked_thresh else: dists = matching.embedding_distance( track_pool_dict[cls_id], detections, metric=self.metric_type) dists = matching.fuse_motion( self.motion, dists, track_pool_dict[cls_id], detections) matches, u_track, u_detection = matching.linear_assignment( dists, thresh=self.tracked_thresh) for i_tracked, idet in matches: # i_tracked is the id of the track and idet is the detection track = track_pool_dict[cls_id][i_tracked] det = detections[idet] if track.state == TrackState.Tracked: # If the track is active, add the detection to the track track.update(detections[idet], self.frame_id) activated_tracks_dict[cls_id].append(track) else: # We have obtained a detection from a track which is not active, # hence put the track in refind_stracks list track.re_activate(det, self.frame_id, new_id=False) refined_tracks_dict[cls_id].append(track) # None of the steps below happen if there are no undetected tracks. """ Step 3: Second association, with IOU""" if self.use_byte: inds_low = pred_dets_dict[cls_id][:, 1:2] > self.low_conf_thres inds_high = pred_dets_dict[cls_id][:, 1:2] < self.conf_thres inds_second = np.logical_and(inds_low, inds_high).squeeze(-1) pred_dets_cls_second = pred_dets_dict[cls_id][inds_second] # association the untrack to the low score detections if len(pred_dets_cls_second) > 0: if pred_embs_dict[cls_id] is None: # in original ByteTrack detections_second = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat=None) for tlbrs in pred_dets_cls_second ] else: pred_embs_cls_second = pred_embs_dict[cls_id][ inds_second] detections_second = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat) for (tlbrs, temp_feat) in zip(pred_dets_cls_second, pred_embs_cls_second) ] else: detections_second = [] r_tracked_stracks = [ track_pool_dict[cls_id][i] for i in u_track if track_pool_dict[cls_id][i].state == TrackState.Tracked ] dists = matching.iou_distance(r_tracked_stracks, detections_second) matches, u_track, u_detection_second = matching.linear_assignment( dists, thresh=0.4) # not r_tracked_thresh else: detections = [detections[i] for i in u_detection] r_tracked_stracks = [] for i in u_track: if track_pool_dict[cls_id][i].state == TrackState.Tracked: r_tracked_stracks.append(track_pool_dict[cls_id][i]) dists = matching.iou_distance(r_tracked_stracks, detections) matches, u_track, u_detection = matching.linear_assignment( dists, thresh=self.r_tracked_thresh) for i_tracked, idet in matches: track = r_tracked_stracks[i_tracked] det = detections[ idet] if not self.use_byte else detections_second[idet] if track.state == TrackState.Tracked: track.update(det, self.frame_id) activated_tracks_dict[cls_id].append(track) else: track.re_activate(det, self.frame_id, new_id=False) refined_tracks_dict[cls_id].append(track) for it in u_track: track = r_tracked_stracks[it] if not track.state == TrackState.Lost: track.mark_lost() lost_tracks_dict[cls_id].append(track) '''Deal with unconfirmed tracks, usually tracks 
with only one beginning frame''' detections = [detections[i] for i in u_detection] dists = matching.iou_distance(unconfirmed_dict[cls_id], detections) matches, u_unconfirmed, u_detection = matching.linear_assignment( dists, thresh=self.unconfirmed_thresh) for i_tracked, idet in matches: unconfirmed_dict[cls_id][i_tracked].update(detections[idet], self.frame_id) activated_tracks_dict[cls_id].append(unconfirmed_dict[cls_id][ i_tracked]) for it in u_unconfirmed: track = unconfirmed_dict[cls_id][it] track.mark_removed() removed_tracks_dict[cls_id].append(track) """ Step 4: Init new stracks""" for inew in u_detection: track = detections[inew] if track.score < self.det_thresh: continue track.activate(self.motion, self.frame_id) activated_tracks_dict[cls_id].append(track) """ Step 5: Update state""" for track in self.lost_tracks_dict[cls_id]: if self.frame_id - track.end_frame > self.max_time_lost: track.mark_removed() removed_tracks_dict[cls_id].append(track) self.tracked_tracks_dict[cls_id] = [ t for t in self.tracked_tracks_dict[cls_id] if t.state == TrackState.Tracked ] self.tracked_tracks_dict[cls_id] = joint_stracks( self.tracked_tracks_dict[cls_id], activated_tracks_dict[cls_id]) self.tracked_tracks_dict[cls_id] = joint_stracks( self.tracked_tracks_dict[cls_id], refined_tracks_dict[cls_id]) self.lost_tracks_dict[cls_id] = sub_stracks( self.lost_tracks_dict[cls_id], self.tracked_tracks_dict[cls_id]) self.lost_tracks_dict[cls_id].extend(lost_tracks_dict[cls_id]) self.lost_tracks_dict[cls_id] = sub_stracks( self.lost_tracks_dict[cls_id], self.removed_tracks_dict[cls_id]) self.removed_tracks_dict[cls_id].extend(removed_tracks_dict[cls_id]) self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[ cls_id] = remove_duplicate_stracks( self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id]) # get scores of lost tracks output_tracks_dict[cls_id] = [ track for track in self.tracked_tracks_dict[cls_id] if track.is_activated ] logger.debug('===========Frame {}=========='.format(self.frame_id)) logger.debug('Activated: {}'.format( [track.track_id for track in activated_tracks_dict[cls_id]])) logger.debug('Refind: {}'.format( [track.track_id for track in refined_tracks_dict[cls_id]])) logger.debug('Lost: {}'.format( [track.track_id for track in lost_tracks_dict[cls_id]])) logger.debug('Removed: {}'.format( [track.track_id for track in removed_tracks_dict[cls_id]])) return output_tracks_dict ================================================ FILE: ppdet/modeling/mot/tracker/ocsort_tracker.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
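A minimal driver sketch for the JDETracker defined above, before the OC-SORT implementation that follows. The two detections are fabricated for illustration; the layout follows the documented [N, 6] = (cls_id, score, x0, y0, x1, y1) convention, and the import path assumes the package exports set up in tracker/__init__.py:

import numpy as np
from ppdet.modeling.mot.tracker import JDETracker

# ByteTrack-style usage: detections only, no ReID embeddings,
# two confidence bands for the two association rounds.
tracker = JDETracker(use_byte=True, conf_thres=0.6, low_conf_thres=0.2)
pred_dets = np.array(
    [[0, 0.9, 100., 100., 150., 200.],   # high-score box, first association
     [0, 0.3, 300., 120., 360., 260.]],  # low-score box, BYTE second pass
    dtype=np.float32)
online = tracker.update(pred_dets, pred_embs=None)  # dict: cls_id -> list[STrack]
for t in online[0]:
    print(t.track_id, t.tlwh, t.score)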
""" This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/ocsort.py """ import numpy as np from ..matching.ocsort_matching import associate, linear_assignment, iou_batch, associate_only_iou from ..motion.ocsort_kalman_filter import OCSORTKalmanFilter from ppdet.core.workspace import register, serializable def k_previous_obs(observations, cur_age, k): if len(observations) == 0: return [-1, -1, -1, -1, -1] for i in range(k): dt = k - i if cur_age - dt in observations: return observations[cur_age - dt] max_age = max(observations.keys()) return observations[max_age] def convert_bbox_to_z(bbox): """ Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is the aspect ratio """ w = bbox[2] - bbox[0] h = bbox[3] - bbox[1] x = bbox[0] + w / 2. y = bbox[1] + h / 2. s = w * h # scale is just area r = w / float(h + 1e-6) return np.array([x, y, s, r]).reshape((4, 1)) def convert_x_to_bbox(x, score=None): """ Takes a bounding box in the centre form [x,y,s,r] and returns it in the form [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right """ w = np.sqrt(x[2] * x[3]) h = x[2] / w if (score == None): return np.array( [x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4)) else: score = np.array([score]) return np.array([ x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score ]).reshape((1, 5)) def speed_direction(bbox1, bbox2): cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 speed = np.array([cy2 - cy1, cx2 - cx1]) norm = np.sqrt((cy2 - cy1)**2 + (cx2 - cx1)**2) + 1e-6 return speed / norm class KalmanBoxTracker(object): """ This class represents the internal state of individual tracked objects observed as bbox. Args: bbox (np.array): bbox in [x1,y1,x2,y2,score] format. delta_t (int): delta_t of previous observation """ count = 0 def __init__(self, bbox, delta_t=3): self.kf = OCSORTKalmanFilter(dim_x=7, dim_z=4) self.kf.F = np.array([[1., 0, 0, 0, 1., 0, 0], [0, 1., 0, 0, 0, 1., 0], [0, 0, 1., 0, 0, 0, 1], [0, 0, 0, 1., 0, 0, 0], [0, 0, 0, 0, 1., 0, 0], [0, 0, 0, 0, 0, 1., 0], [0, 0, 0, 0, 0, 0, 1.]]) self.kf.H = np.array([[1., 0, 0, 0, 0, 0, 0], [0, 1., 0, 0, 0, 0, 0], [0, 0, 1., 0, 0, 0, 0], [0, 0, 0, 1., 0, 0, 0]]) self.kf.R[2:, 2:] *= 10. self.kf.P[4:, 4:] *= 1000. # give high uncertainty to the unobservable initial velocities self.kf.P *= 10. self.kf.Q[-1, -1] *= 0.01 self.kf.Q[4:, 4:] *= 0.01 self.score = bbox[4] self.kf.x[:4] = convert_bbox_to_z(bbox) self.time_since_update = 0 self.id = KalmanBoxTracker.count KalmanBoxTracker.count += 1 self.history = [] self.hits = 0 self.hit_streak = 0 self.age = 0 """ NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same for the return of function k_previous_obs. It is ugly and I do not like it. But to support generate observation array in a fast and unified way, which you would see below k_observations = np.array([k_previous_obs(...]]), let's bear it for now. """ self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder self.observations = dict() self.history_observations = [] self.velocity = None self.delta_t = delta_t def update(self, bbox, angle_cost=False): """ Updates the state vector with observed bbox. 
""" if bbox is not None: if angle_cost and self.last_observation.sum( ) >= 0: # no previous observation previous_box = None for i in range(self.delta_t): dt = self.delta_t - i if self.age - dt in self.observations: previous_box = self.observations[self.age - dt] break if previous_box is None: previous_box = self.last_observation """ Estimate the track speed direction with observations \Delta t steps away """ self.velocity = speed_direction(previous_box, bbox) """ Insert new observations. This is a ugly way to maintain both self.observations and self.history_observations. Bear it for the moment. """ self.last_observation = bbox self.observations[self.age] = bbox self.history_observations.append(bbox) self.time_since_update = 0 self.history = [] self.hits += 1 self.hit_streak += 1 self.kf.update(convert_bbox_to_z(bbox)) else: self.kf.update(bbox) def predict(self): """ Advances the state vector and returns the predicted bounding box estimate. """ if ((self.kf.x[6] + self.kf.x[2]) <= 0): self.kf.x[6] *= 0.0 self.kf.predict() self.age += 1 if (self.time_since_update > 0): self.hit_streak = 0 self.time_since_update += 1 self.history.append(convert_x_to_bbox(self.kf.x, score=self.score)) return self.history[-1] def get_state(self): return convert_x_to_bbox(self.kf.x, score=self.score) @register @serializable class OCSORTTracker(object): """ OCSORT tracker, support single class Args: det_thresh (float): threshold of detection score max_age (int): maximum number of missed misses before a track is deleted min_hits (int): minimum hits for associate iou_threshold (float): iou threshold for associate delta_t (int): delta_t of previous observation inertia (float): vdc_weight of angle_diff_cost for associate vertical_ratio (float): w/h, the vertical ratio of the bbox to filter bad results. If set <= 0 means no need to filter bboxes,usually set 1.6 for pedestrian tracking. min_box_area (int): min box area to filter out low quality boxes use_byte (bool): Whether use ByteTracker, default False """ def __init__(self, det_thresh=0.6, max_age=30, min_hits=3, iou_threshold=0.3, delta_t=3, inertia=0.2, vertical_ratio=-1, min_box_area=0, use_byte=False, use_angle_cost=False): self.det_thresh = det_thresh self.max_age = max_age self.min_hits = min_hits self.iou_threshold = iou_threshold self.delta_t = delta_t self.inertia = inertia self.vertical_ratio = vertical_ratio self.min_box_area = min_box_area self.use_byte = use_byte self.use_angle_cost = use_angle_cost self.trackers = [] self.frame_count = 0 KalmanBoxTracker.count = 0 def update(self, pred_dets, pred_embs=None): """ Args: pred_dets (np.array): Detection results of the image, the shape is [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128] or [N, 512], default as None. Return: tracking boxes (np.array): [M, 6], means 'x0, y0, x1, y1, score, id'. """ if pred_dets is None: return np.empty((0, 6)) self.frame_count += 1 bboxes = pred_dets[:, 2:] scores = pred_dets[:, 1:2] dets = np.concatenate((bboxes, scores), axis=1) scores = scores.squeeze(-1) inds_low = scores > 0.1 inds_high = scores < self.det_thresh inds_second = np.logical_and(inds_low, inds_high) # self.det_thresh > score > 0.1, for second matching dets_second = dets[inds_second] # detections for second matching remain_inds = scores > self.det_thresh dets = dets[remain_inds] # get predicted locations from existing trackers. 
trks = np.zeros((len(self.trackers), 5)) to_del = [] ret = [] for t, trk in enumerate(trks): pos = self.trackers[t].predict()[0] trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] if np.any(np.isnan(pos)): to_del.append(t) trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) for t in reversed(to_del): self.trackers.pop(t) if self.use_angle_cost: velocities = np.array([ trk.velocity if trk.velocity is not None else np.array((0, 0)) for trk in self.trackers ]) k_observations = np.array([ k_previous_obs(trk.observations, trk.age, self.delta_t) for trk in self.trackers ]) last_boxes = np.array([trk.last_observation for trk in self.trackers]) """ First round of association """ if self.use_angle_cost: matched, unmatched_dets, unmatched_trks = associate( dets, trks, self.iou_threshold, velocities, k_observations, self.inertia) else: matched, unmatched_dets, unmatched_trks = associate_only_iou( dets, trks, self.iou_threshold) for m in matched: self.trackers[m[1]].update( dets[m[0], :], angle_cost=self.use_angle_cost) """ Second round of associaton by OCR """ # BYTE association if self.use_byte and len(dets_second) > 0 and unmatched_trks.shape[ 0] > 0: u_trks = trks[unmatched_trks] iou_left = iou_batch( dets_second, u_trks) # iou between low score detections and unmatched tracks iou_left = np.array(iou_left) if iou_left.max() > self.iou_threshold: """ NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may get a higher performance especially on MOT17/MOT20 datasets. But we keep it uniform here for simplicity """ matched_indices = linear_assignment(-iou_left) to_remove_trk_indices = [] for m in matched_indices: det_ind, trk_ind = m[0], unmatched_trks[m[1]] if iou_left[m[0], m[1]] < self.iou_threshold: continue self.trackers[trk_ind].update( dets_second[det_ind, :], angle_cost=self.use_angle_cost) to_remove_trk_indices.append(trk_ind) unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices)) if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0: left_dets = dets[unmatched_dets] left_trks = last_boxes[unmatched_trks] iou_left = iou_batch(left_dets, left_trks) iou_left = np.array(iou_left) if iou_left.max() > self.iou_threshold: """ NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may get a higher performance especially on MOT17/MOT20 datasets. 
But we keep it uniform here for simplicity """ rematched_indices = linear_assignment(-iou_left) to_remove_det_indices = [] to_remove_trk_indices = [] for m in rematched_indices: det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[ 1]] if iou_left[m[0], m[1]] < self.iou_threshold: continue self.trackers[trk_ind].update( dets[det_ind, :], angle_cost=self.use_angle_cost) to_remove_det_indices.append(det_ind) to_remove_trk_indices.append(trk_ind) unmatched_dets = np.setdiff1d(unmatched_dets, np.array(to_remove_det_indices)) unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices)) for m in unmatched_trks: self.trackers[m].update(None) # create and initialise new trackers for unmatched detections for i in unmatched_dets: trk = KalmanBoxTracker(dets[i, :], delta_t=self.delta_t) self.trackers.append(trk) i = len(self.trackers) for trk in reversed(self.trackers): if trk.last_observation.sum() < 0: d = trk.get_state()[0] else: d = trk.last_observation # tlbr + score if (trk.time_since_update < 1) and ( trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits): # +1 as MOT benchmark requires positive ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) i -= 1 # remove dead tracklet if (trk.time_since_update > self.max_age): self.trackers.pop(i) if (len(ret) > 0): return np.concatenate(ret) return np.empty((0, 6)) ================================================ FILE: ppdet/modeling/mot/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import time import numpy as np from .visualization import plot_tracking_dict, plot_tracking __all__ = [ 'MOTTimer', 'Detection', 'write_mot_results', 'save_vis_results', 'load_det_results', 'preprocess_reid', 'get_crops', 'clip_box', 'scale_coords', ] class MOTTimer(object): """ This class used to compute and print the current FPS while evaling. """ def __init__(self): self.total_time = 0. self.calls = 0 self.start_time = 0. self.diff = 0. self.average_time = 0. self.duration = 0. def tic(self): # using time.time instead of time.clock because time time.clock # does not normalize for multithreading self.start_time = time.time() def toc(self, average=True): self.diff = time.time() - self.start_time self.total_time += self.diff self.calls += 1 self.average_time = self.total_time / self.calls if average: self.duration = self.average_time else: self.duration = self.diff return self.duration def clear(self): self.total_time = 0. self.calls = 0 self.start_time = 0. self.diff = 0. self.average_time = 0. self.duration = 0. class Detection(object): """ This class represents a bounding box detection in a single image. Args: tlwh (Tensor): Bounding box in format `(top left x, top left y, width, height)`. score (Tensor): Bounding box confidence score. feature (Tensor): A feature vector that describes the object contained in this image. cls_id (Tensor): Bounding box category id. 
""" def __init__(self, tlwh, score, feature, cls_id): self.tlwh = np.asarray(tlwh, dtype=np.float32) self.score = float(score) self.feature = np.asarray(feature, dtype=np.float32) self.cls_id = int(cls_id) def to_tlbr(self): """ Convert bounding box to format `(min x, min y, max x, max y)`, i.e., `(top left, bottom right)`. """ ret = self.tlwh.copy() ret[2:] += ret[:2] return ret def to_xyah(self): """ Convert bounding box to format `(center x, center y, aspect ratio, height)`, where the aspect ratio is `width / height`. """ ret = self.tlwh.copy() ret[:2] += ret[2:] / 2 ret[2] /= ret[3] return ret def write_mot_results(filename, results, data_type='mot', num_classes=1): # support single and multi classes if data_type in ['mot', 'mcmot']: save_format = '{frame},{id},{x1},{y1},{w},{h},{score},{cls_id},-1,-1\n' elif data_type == 'kitti': save_format = '{frame} {id} car 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' else: raise ValueError(data_type) f = open(filename, 'w') for cls_id in range(num_classes): for frame_id, tlwhs, tscores, track_ids in results[cls_id]: if data_type == 'kitti': frame_id -= 1 for tlwh, score, track_id in zip(tlwhs, tscores, track_ids): if track_id < 0: continue if data_type == 'mot': cls_id = -1 x1, y1, w, h = tlwh x2, y2 = x1 + w, y1 + h line = save_format.format( frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=score, cls_id=cls_id) f.write(line) print('MOT results save in {}'.format(filename)) def save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, average_time, show_image, save_dir, num_classes=1, ids2names=[]): if show_image or save_dir is not None: assert 'ori_image' in data img0 = data['ori_image'].numpy()[0] if online_ids is None: online_im = img0 else: if isinstance(online_tlwhs, dict): online_im = plot_tracking_dict( img0, num_classes, online_tlwhs, online_ids, online_scores, frame_id=frame_id, fps=1. / average_time, ids2names=ids2names) else: online_im = plot_tracking( img0, online_tlwhs, online_ids, online_scores, frame_id=frame_id, fps=1. / average_time, ids2names=ids2names) if show_image: cv2.imshow('online_im', online_im) if save_dir is not None: cv2.imwrite( os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im) def load_det_results(det_file, num_frames): assert os.path.exists(det_file) and os.path.isfile(det_file), \ '{} is not exist or not a file.'.format(det_file) labels = np.loadtxt(det_file, dtype='float32', delimiter=',') assert labels.shape[1] == 7, \ "Each line of {} should have 7 items: '[frame_id],[x0],[y0],[w],[h],[score],[class_id]'.".format(det_file) results_list = [] for frame_i in range(num_frames): results = {'bbox': [], 'score': [], 'cls_id': []} lables_with_frame = labels[labels[:, 0] == frame_i + 1] # each line of lables_with_frame: # [frame_id],[x0],[y0],[w],[h],[score],[class_id] for l in lables_with_frame: results['bbox'].append(l[1:5]) results['score'].append(l[5:6]) results['cls_id'].append(l[6:7]) results_list.append(results) return results_list def scale_coords(coords, input_shape, im_shape, scale_factor): # Note: ratio has only one value, scale_factor[0] == scale_factor[1] # # This function only used for JDE YOLOv3 or other detectors with # LetterBoxResize and JDEBBoxPostProcess, coords output from detector had # not scaled back to the origin image. 
ratio = scale_factor[0] pad_w = (input_shape[1] - int(im_shape[1])) / 2 pad_h = (input_shape[0] - int(im_shape[0])) / 2 coords[:, 0::2] -= pad_w coords[:, 1::2] -= pad_h coords[:, 0:4] /= ratio coords[:, :4] = np.clip(coords[:, :4], a_min=0, a_max=coords[:, :4].max()) return coords.round() def clip_box(xyxy, ori_image_shape): H, W = ori_image_shape xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], a_min=0, a_max=W) xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], a_min=0, a_max=H) w = xyxy[:, 2:3] - xyxy[:, 0:1] h = xyxy[:, 3:4] - xyxy[:, 1:2] mask = np.logical_and(h > 0, w > 0) keep_idx = np.nonzero(mask) return xyxy[keep_idx[0]], keep_idx def get_crops(xyxy, ori_img, w, h): crops = [] xyxy = xyxy.astype(np.int64) ori_img = ori_img.numpy() ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2) # [h,w,3]->[w,h,3] for i, bbox in enumerate(xyxy): crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :] crops.append(crop) crops = preprocess_reid(crops, w, h) return crops def preprocess_reid(imgs, w=64, h=192, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): im_batch = [] for img in imgs: img = cv2.resize(img, (w, h)) img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 img_mean = np.array(mean).reshape((3, 1, 1)) img_std = np.array(std).reshape((3, 1, 1)) img -= img_mean img /= img_std img = np.expand_dims(img, axis=0) im_batch.append(img) im_batch = np.concatenate(im_batch, 0) return im_batch ================================================ FILE: ppdet/modeling/mot/visualization.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import cv2 import numpy as np def get_color(idx): idx = idx * 3 color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) return color def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2names=[]): im = np.ascontiguousarray(np.copy(image)) im_h, im_w = im.shape[:2] top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 text_scale = max(1, image.shape[1] / 1600.) text_thickness = 2 line_thickness = max(1, int(image.shape[1] / 500.)) radius = max(5, int(im_w / 140.)) cv2.putText( im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) for i, tlwh in enumerate(tlwhs): x1, y1, w, h = tlwh intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) obj_id = int(obj_ids[i]) id_text = '{}'.format(int(obj_id)) if ids2names != []: assert len( ids2names) == 1, "plot_tracking only supports single classes." 
id_text = '{}_'.format(ids2names[0]) + id_text _line_thickness = 1 if obj_id <= 0 else line_thickness color = get_color(abs(obj_id)) cv2.rectangle( im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) cv2.putText( im, id_text, (intbox[0], intbox[1] - 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=text_thickness) if scores is not None: text = '{:.2f}'.format(float(scores[i])) cv2.putText( im, text, (intbox[0], intbox[1] + 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), thickness=text_thickness) return im def plot_tracking_dict(image, num_classes, tlwhs_dict, obj_ids_dict, scores_dict, frame_id=0, fps=0., ids2names=[]): im = np.ascontiguousarray(np.copy(image)) im_h, im_w = im.shape[:2] top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 text_scale = max(1, image.shape[1] / 1600.) text_thickness = 2 line_thickness = max(1, int(image.shape[1] / 500.)) radius = max(5, int(im_w / 140.)) for cls_id in range(num_classes): tlwhs = tlwhs_dict[cls_id] obj_ids = obj_ids_dict[cls_id] scores = scores_dict[cls_id] cv2.putText( im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) for i, tlwh in enumerate(tlwhs): x1, y1, w, h = tlwh intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) obj_id = int(obj_ids[i]) id_text = '{}'.format(int(obj_id)) if ids2names != []: id_text = '{}_{}'.format(ids2names[cls_id], id_text) else: id_text = 'class{}_{}'.format(cls_id, id_text) _line_thickness = 1 if obj_id <= 0 else line_thickness color = get_color(abs(obj_id)) cv2.rectangle( im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) cv2.putText( im, id_text, (intbox[0], intbox[1] - 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=text_thickness) if scores is not None: text = '{:.2f}'.format(float(scores[i])) cv2.putText( im, text, (intbox[0], intbox[1] + 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), thickness=text_thickness) return im ================================================ FILE: ppdet/modeling/necks/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import fpn from . import yolo_fpn from . import hrfpn from . import ttf_fpn from . import centernet_fpn from . import bifpn from . import csp_pan from . import es_pan from . import lc_pan from . import custom_pan from . import dilated_encoder from . 
import clrnet_fpn from .fpn import * from .yolo_fpn import * from .hrfpn import * from .ttf_fpn import * from .centernet_fpn import * from .blazeface_fpn import * from .bifpn import * from .csp_pan import * from .es_pan import * from .lc_pan import * from .custom_pan import * from .dilated_encoder import * from .channel_mapper import * from .clrnet_fpn import * ================================================ FILE: ppdet/modeling/necks/bifpn.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ..shape_spec import ShapeSpec __all__ = ['BiFPN'] class SeparableConvLayer(nn.Layer): def __init__(self, in_channels, out_channels=None, kernel_size=3, norm_type='bn', norm_groups=32, act='swish'): super(SeparableConvLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn', None] assert act in ['swish', 'relu', None] self.in_channels = in_channels if out_channels is None: self.out_channels = self.in_channels self.norm_type = norm_type self.norm_groups = norm_groups self.depthwise_conv = nn.Conv2D( in_channels, in_channels, kernel_size, padding=kernel_size // 2, groups=in_channels, bias_attr=False) self.pointwise_conv = nn.Conv2D(in_channels, self.out_channels, 1) # norm type if self.norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D(self.out_channels) elif self.norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=self.norm_groups, num_channels=self.out_channels) # activation if act == 'swish': self.act = nn.Swish() elif act == 'relu': self.act = nn.ReLU() def forward(self, x): if self.act is not None: x = self.act(x) out = self.depthwise_conv(x) out = self.pointwise_conv(out) if self.norm_type is not None: out = self.norm(out) return out class BiFPNCell(nn.Layer): def __init__(self, channels=256, num_levels=5, eps=1e-5, use_weighted_fusion=True, kernel_size=3, norm_type='bn', norm_groups=32, act='swish'): super(BiFPNCell, self).__init__() self.channels = channels self.num_levels = num_levels self.eps = eps self.use_weighted_fusion = use_weighted_fusion # up self.conv_up = nn.LayerList([ SeparableConvLayer( self.channels, kernel_size=kernel_size, norm_type=norm_type, norm_groups=norm_groups, act=act) for _ in range(self.num_levels - 1) ]) # down self.conv_down = nn.LayerList([ SeparableConvLayer( self.channels, kernel_size=kernel_size, norm_type=norm_type, norm_groups=norm_groups, act=act) for _ in range(self.num_levels - 1) ]) if self.use_weighted_fusion: self.up_weights = self.create_parameter( shape=[self.num_levels - 1, 2], attr=ParamAttr(initializer=Constant(1.))) self.down_weights = self.create_parameter( shape=[self.num_levels - 1, 3], attr=ParamAttr(initializer=Constant(1.))) def _feature_fusion_cell(self, conv_layer, lateral_feat, 
sampling_feat, route_feat=None, weights=None): if self.use_weighted_fusion: weights = F.relu(weights) weights = weights / (weights.sum() + self.eps) if route_feat is not None: out_feat = weights[0] * lateral_feat + \ weights[1] * sampling_feat + \ weights[2] * route_feat else: out_feat = weights[0] * lateral_feat + \ weights[1] * sampling_feat else: if route_feat is not None: out_feat = lateral_feat + sampling_feat + route_feat else: out_feat = lateral_feat + sampling_feat out_feat = conv_layer(out_feat) return out_feat def forward(self, feats): # feats: [P3 - P7] lateral_feats = [] # up up_feature = feats[-1] for i, feature in enumerate(feats[::-1]): if i == 0: lateral_feats.append(feature) else: shape = feature.shape up_feature = F.interpolate( up_feature, size=[shape[2], shape[3]]) lateral_feature = self._feature_fusion_cell( self.conv_up[i - 1], feature, up_feature, weights=self.up_weights[i - 1] if self.use_weighted_fusion else None) lateral_feats.append(lateral_feature) up_feature = lateral_feature out_feats = [] # down down_feature = lateral_feats[-1] for i, (lateral_feature, route_feature) in enumerate(zip(lateral_feats[::-1], feats)): if i == 0: out_feats.append(lateral_feature) else: down_feature = F.max_pool2d(down_feature, 3, 2, 1) if i == len(feats) - 1: route_feature = None weights = self.down_weights[ i - 1][:2] if self.use_weighted_fusion else None else: weights = self.down_weights[ i - 1] if self.use_weighted_fusion else None out_feature = self._feature_fusion_cell( self.conv_down[i - 1], lateral_feature, down_feature, route_feature, weights=weights) out_feats.append(out_feature) down_feature = out_feature return out_feats @register @serializable class BiFPN(nn.Layer): """ Bidirectional Feature Pyramid Network, see https://arxiv.org/abs/1911.09070 Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config. out_channel (int): output channel of each level. num_extra_levels (int): the number of extra stages added to the last level. default: 2 fpn_strides (List): The stride of each level. num_stacks (int): the number of stacks for BiFPN, default: 1. use_weighted_fusion (bool): use weighted feature fusion in BiFPN, default: True. norm_type (string|None): the normalization type in BiFPN module. If norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default: bn. norm_groups (int): if you use gn, set this param. act (string|None): the activation function of BiFPN. """ def __init__(self, in_channels=(512, 1024, 2048), out_channel=256, num_extra_levels=2, fpn_strides=[8, 16, 32, 64, 128], num_stacks=1, use_weighted_fusion=True, norm_type='bn', norm_groups=32, act='swish'): super(BiFPN, self).__init__() assert num_stacks > 0, "The number of stacks of BiFPN is at least 1." assert norm_type in ['bn', 'sync_bn', 'gn', None] assert act in ['swish', 'relu', None] assert num_extra_levels >= 0, \ "The `num_extra_levels` must be non negative(>=0)." 
        self.in_channels = in_channels
        self.out_channel = out_channel
        self.num_extra_levels = num_extra_levels
        self.num_stacks = num_stacks
        self.use_weighted_fusion = use_weighted_fusion
        self.norm_type = norm_type
        self.norm_groups = norm_groups
        self.act = act
        self.num_levels = len(self.in_channels) + self.num_extra_levels
        if len(fpn_strides) != self.num_levels:
            for i in range(self.num_extra_levels):
                # rebind rather than `+=` so the default `fpn_strides` list
                # is not mutated across instantiations
                fpn_strides = fpn_strides + [fpn_strides[-1] * 2]
        self.fpn_strides = fpn_strides

        self.lateral_convs = nn.LayerList()
        for in_c in in_channels:
            self.lateral_convs.append(
                ConvNormLayer(in_c, self.out_channel, 1, 1))
        if self.num_extra_levels > 0:
            self.extra_convs = nn.LayerList()
            for i in range(self.num_extra_levels):
                if i == 0:
                    self.extra_convs.append(
                        ConvNormLayer(self.in_channels[-1], self.out_channel,
                                      3, 2))
                else:
                    self.extra_convs.append(nn.MaxPool2D(3, 2, 1))
        self.bifpn_cells = nn.LayerList()
        for i in range(self.num_stacks):
            self.bifpn_cells.append(
                BiFPNCell(
                    self.out_channel,
                    self.num_levels,
                    use_weighted_fusion=self.use_weighted_fusion,
                    norm_type=self.norm_type,
                    norm_groups=self.norm_groups,
                    act=self.act))

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            'in_channels': [i.channels for i in input_shape],
            'fpn_strides': [i.stride for i in input_shape]
        }

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=self.out_channel, stride=s) for s in self.fpn_strides
        ]

    def forward(self, feats):
        assert len(feats) == len(self.in_channels)
        fpn_feats = []
        for conv_layer, feature in zip(self.lateral_convs, feats):
            fpn_feats.append(conv_layer(feature))
        if self.num_extra_levels > 0:
            feat = feats[-1]
            for conv_layer in self.extra_convs:
                feat = conv_layer(feat)
                fpn_feats.append(feat)
        for bifpn_cell in self.bifpn_cells:
            fpn_feats = bifpn_cell(fpn_feats)
        return fpn_feats


================================================
FILE: ppdet/modeling/necks/blazeface_fpn.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn.functional as F
from paddle import ParamAttr
import paddle.nn as nn
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec

__all__ = ['BlazeNeck']


def hard_swish(x):
    return x * F.relu6(x + 3) / 6.
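# Illustrative sanity check (a sketch, assuming Paddle's built-in
# F.hardswish computes the same x * relu6(x + 3) / 6 curve as the helper
# above; uses only the imports already at the top of this file):
#
#     x = paddle.linspace(-6., 6., 25)
#     assert float(paddle.abs(hard_swish(x) - F.hardswish(x)).max()) < 1e-6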
class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_groups=1, act='relu', conv_lr=0.1, conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ConvBNLayer, self).__init__() self.act = act self._conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=conv_lr, initializer=KaimingNormal()), bias_attr=False) if norm_type in ['sync_bn', 'bn']: self._batch_norm = nn.BatchNorm2D(out_channels) def forward(self, x): x = self._conv(x) x = self._batch_norm(x) if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) elif self.act == 'leaky': x = F.leaky_relu(x) elif self.act == 'hard_swish': x = hard_swish(x) return x class FPN(nn.Layer): def __init__(self, in_channels, out_channels, name=None): super(FPN, self).__init__() self.conv1_fpn = ConvBNLayer( in_channels, out_channels // 2, kernel_size=1, padding=0, stride=1, act='leaky', name=name + '_output1') self.conv2_fpn = ConvBNLayer( in_channels, out_channels // 2, kernel_size=1, padding=0, stride=1, act='leaky', name=name + '_output2') self.conv3_fpn = ConvBNLayer( out_channels // 2, out_channels // 2, kernel_size=3, padding=1, stride=1, act='leaky', name=name + '_merge') def forward(self, input): output1 = self.conv1_fpn(input[0]) output2 = self.conv2_fpn(input[1]) up2 = F.upsample( output2, size=output1.shape[-2:], mode='nearest') output1 = paddle.add(output1, up2) output1 = self.conv3_fpn(output1) return output1, output2 class SSH(nn.Layer): def __init__(self, in_channels, out_channels, name=None): super(SSH, self).__init__() assert out_channels % 4 == 0 self.conv0_ssh = ConvBNLayer( in_channels, out_channels // 2, kernel_size=3, padding=1, stride=1, act=None, name=name + 'ssh_conv3') self.conv1_ssh = ConvBNLayer( out_channels // 2, out_channels // 4, kernel_size=3, padding=1, stride=1, act='leaky', name=name + 'ssh_conv5_1') self.conv2_ssh = ConvBNLayer( out_channels // 4, out_channels // 4, kernel_size=3, padding=1, stride=1, act=None, name=name + 'ssh_conv5_2') self.conv3_ssh = ConvBNLayer( out_channels // 4, out_channels // 4, kernel_size=3, padding=1, stride=1, act='leaky', name=name + 'ssh_conv7_1') self.conv4_ssh = ConvBNLayer( out_channels // 4, out_channels // 4, kernel_size=3, padding=1, stride=1, act=None, name=name + 'ssh_conv7_2') def forward(self, x): conv0 = self.conv0_ssh(x) conv1 = self.conv1_ssh(conv0) conv2 = self.conv2_ssh(conv1) conv3 = self.conv3_ssh(conv2) conv4 = self.conv4_ssh(conv3) concat = paddle.concat([conv0, conv2, conv4], axis=1) return F.relu(concat) @register @serializable class BlazeNeck(nn.Layer): def __init__(self, in_channel, neck_type="None", data_format='NCHW'): super(BlazeNeck, self).__init__() self.neck_type = neck_type self.reture_input = False self._out_channels = in_channel if self.neck_type == 'None': self.reture_input = True if "fpn" in self.neck_type: self.fpn = FPN(self._out_channels[0], self._out_channels[1], name='fpn') self._out_channels = [ self._out_channels[0] // 2, self._out_channels[1] // 2 ] if "ssh" in self.neck_type: self.ssh1 = SSH(self._out_channels[0], self._out_channels[0], name='ssh1') self.ssh2 = SSH(self._out_channels[1], self._out_channels[1], name='ssh2') self._out_channels = [self._out_channels[0], self._out_channels[1]] def forward(self, inputs): if self.reture_input: return inputs output1, output2 = None, None if "fpn" in self.neck_type: backout_4, backout_1 = inputs output1, output2 = 
self.fpn([backout_4, backout_1]) if self.neck_type == "only_fpn": return [output1, output2] if self.neck_type == "only_ssh": output1, output2 = inputs feature1 = self.ssh1(output1) feature2 = self.ssh2(output2) return [feature1, feature2] @property def out_shape(self): return [ ShapeSpec(channels=c) for c in [self._out_channels[0], self._out_channels[1]] ] ================================================ FILE: ppdet/modeling/necks/centernet_fpn.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import math import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.nn.initializer import Uniform import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ppdet.modeling.backbones.hardnet import ConvLayer, HarDBlock from ..shape_spec import ShapeSpec __all__ = ['CenterNetDLAFPN', 'CenterNetHarDNetFPN'] # SGE attention class BasicConv(nn.Layer): def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias_attr=False): super(BasicConv, self).__init__() self.out_channels = out_planes self.conv = nn.Conv2D( in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias_attr=bias_attr) self.bn = nn.BatchNorm2D( out_planes, epsilon=1e-5, momentum=0.01, weight_attr=False, bias_attr=False) if bn else None self.relu = nn.ReLU() if relu else None def forward(self, x): x = self.conv(x) if self.bn is not None: x = self.bn(x) if self.relu is not None: x = self.relu(x) return x class ChannelPool(nn.Layer): def forward(self, x): return paddle.concat( (paddle.max(x, 1).unsqueeze(1), paddle.mean(x, 1).unsqueeze(1)), axis=1) class SpatialGate(nn.Layer): def __init__(self): super(SpatialGate, self).__init__() kernel_size = 7 self.compress = ChannelPool() self.spatial = BasicConv( 2, 1, kernel_size, stride=1, padding=(kernel_size - 1) // 2, relu=False) def forward(self, x): x_compress = self.compress(x) x_out = self.spatial(x_compress) scale = F.sigmoid(x_out) # broadcasting return x * scale def fill_up_weights(up): weight = up.weight.numpy() f = math.ceil(weight.shape[2] / 2) c = (2 * f - 1 - f % 2) / (2. * f) for i in range(weight.shape[2]): for j in range(weight.shape[3]): weight[0, 0, i, j] = \ (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) for c in range(1, weight.shape[0]): weight[c, 0, :, :] = weight[0, 0, :, :] up.weight.set_value(weight) class IDAUp(nn.Layer): def __init__(self, ch_ins, ch_out, up_strides, dcn_v2=True): super(IDAUp, self).__init__() for i in range(1, len(ch_ins)): ch_in = ch_ins[i] up_s = int(up_strides[i]) fan_in = ch_in * 3 * 3 stdv = 1. 
/ math.sqrt(fan_in) proj = nn.Sequential( ConvNormLayer( ch_in, ch_out, filter_size=3, stride=1, use_dcn=dcn_v2, bias_on=dcn_v2, norm_decay=None, dcn_lr_scale=1., dcn_regularizer=None, initializer=Uniform(-stdv, stdv)), nn.ReLU()) node = nn.Sequential( ConvNormLayer( ch_out, ch_out, filter_size=3, stride=1, use_dcn=dcn_v2, bias_on=dcn_v2, norm_decay=None, dcn_lr_scale=1., dcn_regularizer=None, initializer=Uniform(-stdv, stdv)), nn.ReLU()) kernel_size = up_s * 2 fan_in = ch_out * kernel_size * kernel_size stdv = 1. / math.sqrt(fan_in) up = nn.Conv2DTranspose( ch_out, ch_out, kernel_size=up_s * 2, stride=up_s, padding=up_s // 2, groups=ch_out, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=False) fill_up_weights(up) setattr(self, 'proj_' + str(i), proj) setattr(self, 'up_' + str(i), up) setattr(self, 'node_' + str(i), node) def forward(self, inputs, start_level, end_level): for i in range(start_level + 1, end_level): upsample = getattr(self, 'up_' + str(i - start_level)) project = getattr(self, 'proj_' + str(i - start_level)) inputs[i] = project(inputs[i]) inputs[i] = upsample(inputs[i]) node = getattr(self, 'node_' + str(i - start_level)) inputs[i] = node(paddle.add(inputs[i], inputs[i - 1])) return inputs class DLAUp(nn.Layer): def __init__(self, start_level, channels, scales, ch_in=None, dcn_v2=True): super(DLAUp, self).__init__() self.start_level = start_level if ch_in is None: ch_in = channels self.channels = channels channels = list(channels) scales = np.array(scales, dtype=int) for i in range(len(channels) - 1): j = -i - 2 setattr( self, 'ida_{}'.format(i), IDAUp( ch_in[j:], channels[j], scales[j:] // scales[j], dcn_v2=dcn_v2)) scales[j + 1:] = scales[j] ch_in[j + 1:] = [channels[j] for _ in channels[j + 1:]] def forward(self, inputs): out = [inputs[-1]] # start with 32 for i in range(len(inputs) - self.start_level - 1): ida = getattr(self, 'ida_{}'.format(i)) outputs = ida(inputs, len(inputs) - i - 2, len(inputs)) out.insert(0, outputs[-1]) return out @register @serializable class CenterNetDLAFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone. [16, 32, 64, 128, 256, 512] by default, means the channels of DLA-34 down_ratio (int): the down ratio from images to heatmap, 4 by default last_level (int): the last level of input feature fed into the upsamplng block out_channel (int): the channel of the output feature, 0 by default means the channel of the input feature whose down ratio is `down_ratio` first_level (None): the first level of input feature fed into the upsamplng block. 
if None, the first level stands for logs(down_ratio) dcn_v2 (bool): whether use the DCNv2, True by default with_sge (bool): whether use SGE attention, False by default """ def __init__(self, in_channels, down_ratio=4, last_level=5, out_channel=0, first_level=None, dcn_v2=True, with_sge=False): super(CenterNetDLAFPN, self).__init__() self.first_level = int(np.log2( down_ratio)) if first_level is None else first_level assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( self.first_level) self.down_ratio = down_ratio self.last_level = last_level scales = [2**i for i in range(len(in_channels[self.first_level:]))] self.dla_up = DLAUp( self.first_level, in_channels[self.first_level:], scales, dcn_v2=dcn_v2) self.out_channel = out_channel if out_channel == 0: self.out_channel = in_channels[self.first_level] self.ida_up = IDAUp( in_channels[self.first_level:self.last_level], self.out_channel, [2**i for i in range(self.last_level - self.first_level)], dcn_v2=dcn_v2) self.with_sge = with_sge if self.with_sge: self.sge_attention = SpatialGate() @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape]} def forward(self, body_feats): inputs = [body_feats[i] for i in range(len(body_feats))] dla_up_feats = self.dla_up(inputs) ida_up_feats = [] for i in range(self.last_level - self.first_level): ida_up_feats.append(dla_up_feats[i].clone()) self.ida_up(ida_up_feats, 0, len(ida_up_feats)) feat = ida_up_feats[-1] if self.with_sge: feat = self.sge_attention(feat) if self.down_ratio != 4: feat = F.interpolate( feat, scale_factor=self.down_ratio // 4, mode="bilinear", align_corners=True) return feat @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] class TransitionUp(nn.Layer): def __init__(self, in_channels, out_channels): super().__init__() def forward(self, x, skip): w, h = skip.shape[2], skip.shape[3] out = F.interpolate(x, size=(w, h), mode="bilinear", align_corners=True) out = paddle.concat([out, skip], 1) return out @register @serializable class CenterNetHarDNetFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone. [96, 214, 458, 784] by default, means the channels of HarDNet85 num_layers (int): HarDNet laters, 85 by default down_ratio (int): the down ratio from images to heatmap, 4 by default first_level (int|None): the first level of input feature fed into the upsamplng block. 
if None, the first level stands for logs(down_ratio) - 1 last_level (int): the last level of input feature fed into the upsamplng block out_channel (int): the channel of the output feature, 0 by default means the channel of the input feature whose down ratio is `down_ratio` """ def __init__(self, in_channels, num_layers=85, down_ratio=4, first_level=None, last_level=4, out_channel=0): super(CenterNetHarDNetFPN, self).__init__() self.first_level = int(np.log2( down_ratio)) - 1 if first_level is None else first_level assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( self.first_level) self.down_ratio = down_ratio self.last_level = last_level self.last_pool = nn.AvgPool2D(kernel_size=2, stride=2) assert num_layers in [68, 85], "HarDNet-{} not support.".format( num_layers) if num_layers == 85: self.last_proj = ConvLayer(784, 256, kernel_size=1) self.last_blk = HarDBlock(768, 80, 1.7, 8) self.skip_nodes = [1, 3, 8, 13] self.SC = [32, 32, 0] gr = [64, 48, 28] layers = [8, 8, 4] ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]] channels = [96, 214, 458, 784] self.skip_lv = 3 elif num_layers == 68: self.last_proj = ConvLayer(654, 192, kernel_size=1) self.last_blk = HarDBlock(576, 72, 1.7, 8) self.skip_nodes = [1, 3, 8, 11] self.SC = [32, 32, 0] gr = [48, 32, 20] layers = [8, 8, 4] ch_list2 = [224 + self.SC[0], 96 + self.SC[1], 64 + self.SC[2]] channels = [64, 124, 328, 654] self.skip_lv = 2 self.transUpBlocks = nn.LayerList([]) self.denseBlocksUp = nn.LayerList([]) self.conv1x1_up = nn.LayerList([]) self.avg9x9 = nn.AvgPool2D(kernel_size=(9, 9), stride=1, padding=(4, 4)) prev_ch = self.last_blk.get_out_ch() for i in range(3): skip_ch = channels[3 - i] self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch)) if i < self.skip_lv: cur_ch = prev_ch + skip_ch else: cur_ch = prev_ch self.conv1x1_up.append( ConvLayer( cur_ch, ch_list2[i], kernel_size=1)) cur_ch = ch_list2[i] cur_ch -= self.SC[i] cur_ch *= 3 blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i]) self.denseBlocksUp.append(blk) prev_ch = blk.get_out_ch() prev_ch += self.SC[0] + self.SC[1] + self.SC[2] self.out_channel = prev_ch @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape]} def forward(self, body_feats): x = body_feats[-1] x_sc = [] x = self.last_proj(x) x = self.last_pool(x) x2 = self.avg9x9(x) x3 = x / (x.sum((2, 3), keepdim=True) + 0.1) x = paddle.concat([x, x2, x3], 1) x = self.last_blk(x) for i in range(3): skip_x = body_feats[3 - i] x_up = self.transUpBlocks[i](x, skip_x) x_ch = self.conv1x1_up[i](x_up) if self.SC[i] > 0: end = x_ch.shape[1] new_st = end - self.SC[i] x_sc.append(x_ch[:, new_st:, :, :]) x_ch = x_ch[:, :new_st, :, :] x2 = self.avg9x9(x_ch) x3 = x_ch / (x_ch.sum((2, 3), keepdim=True) + 0.1) x_new = paddle.concat([x_ch, x2, x3], 1) x = self.denseBlocksUp[i](x_new) scs = [x] for i in range(3): if self.SC[i] > 0: scs.insert( 0, F.interpolate( x_sc[i], size=(x.shape[2], x.shape[3]), mode="bilinear", align_corners=True)) neck_feat = paddle.concat(scs, 1) return neck_feat @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] ================================================ FILE: ppdet/modeling/necks/channel_mapper.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on mmdet:
git@github.com:open-mmlab/mmdetection.git
"""

import paddle.nn as nn
from ppdet.core.workspace import register, serializable
from ..backbones.hrnet import ConvNormLayer
from ..shape_spec import ShapeSpec
from ..initializer import xavier_uniform_, constant_

__all__ = ['ChannelMapper']


@register
@serializable
class ChannelMapper(nn.Layer):
    """Channel Mapper to reduce/increase channels of backbone features.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        kernel_size (int, optional): kernel_size for reducing channels (used
            at each scale). Default: 3.
        norm_type (str, optional): normalization type of ConvNormLayer.
            Default: "gn".
        norm_groups (int, optional): number of groups for group norm.
            Default: 32.
        act (str, optional): activation applied after each conv.
            Default: 'relu'.
        num_outs (int, optional): Number of output feature maps. There would
            be extra_convs when num_outs is larger than the length of
            in_channels.
        init_cfg (dict, optional): Initialization config dict, kept for mmdet
            compatibility; weights are actually set in init_weights.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 norm_type="gn",
                 norm_groups=32,
                 act='relu',
                 num_outs=None,
                 init_cfg=dict(
                     type='Xavier', layer='Conv2d', distribution='uniform')):
        super(ChannelMapper, self).__init__()
        assert isinstance(in_channels, list)
        self.extra_convs = None
        if num_outs is None:
            num_outs = len(in_channels)
        self.out_channel = out_channels
        self.num_outs = num_outs
        self.convs = nn.LayerList()
        for in_channel in in_channels:
            self.convs.append(
                ConvNormLayer(
                    ch_in=in_channel,
                    ch_out=out_channels,
                    filter_size=kernel_size,
                    norm_type='gn',
                    norm_groups=32,
                    act=act))

        if num_outs > len(in_channels):
            self.extra_convs = nn.LayerList()
            for i in range(len(in_channels), num_outs):
                if i == len(in_channels):
                    in_channel = in_channels[-1]
                else:
                    in_channel = out_channels
                self.extra_convs.append(
                    ConvNormLayer(
                        ch_in=in_channel,
                        ch_out=out_channels,
                        filter_size=3,
                        stride=2,
                        norm_type='gn',
                        norm_groups=32,
                        act=act))
        self.init_weights()

    def forward(self, inputs):
        """Forward function."""
        assert len(inputs) == len(self.convs)
        outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
        if self.extra_convs:
            for i in range(len(self.extra_convs)):
                if i == 0:
                    outs.append(self.extra_convs[0](inputs[-1]))
                else:
                    outs.append(self.extra_convs[i](outs[-1]))
        return tuple(outs)

    @property
    def out_shape(self):
        # strides depend on the backbone's output shapes, so only the
        # channel count is reported here
        return [
            ShapeSpec(channels=self.out_channel) for _ in range(self.num_outs)
        ]

    def init_weights(self):
        """Initialize the transformer weights."""
        for p in self.parameters():
            if p.rank() > 1:
                xavier_uniform_(p)
                if hasattr(p, 'bias') and p.bias is not None:
                    constant_(p.bias)


================================================
FILE: ppdet/modeling/necks/clrnet_fpn.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import XavierUniform from ppdet.modeling.initializer import kaiming_normal_, constant_ from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ppdet.modeling.shape_spec import ShapeSpec __all__ = ['CLRFPN'] @register @serializable class CLRFPN(nn.Layer): """ Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config out_channel (int): output channel of each level spatial_scales (list[float]): the spatial scales between input feature maps and original input image which can be derived from the output shape of backbone by from_config has_extra_convs (bool): whether to add extra conv to the last level. default False extra_stage (int): the number of extra stages added to the last level. default 1 use_c5 (bool): Whether to use c5 as the input of extra stage, otherwise p5 is used. default True norm_type (string|None): The normalization type in FPN module. If norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default None norm_decay (float): weight decay for normalization layer weights. default 0. freeze_norm (bool): whether to freeze normalization layer. default False relu_before_extra_convs (bool): whether to add relu before extra convs. default False """ def __init__(self, in_channels, out_channel, spatial_scales=[0.25, 0.125, 0.0625, 0.03125], has_extra_convs=False, extra_stage=1, use_c5=True, norm_type=None, norm_decay=0., freeze_norm=False, relu_before_extra_convs=True): super(CLRFPN, self).__init__() self.out_channel = out_channel for s in range(extra_stage): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.has_extra_convs = has_extra_convs self.extra_stage = extra_stage self.use_c5 = use_c5 self.relu_before_extra_convs = relu_before_extra_convs self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.in_channels = in_channels self.lateral_convs = [] self.fpn_convs = [] fan = out_channel * 3 * 3 # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone # 0 <= st_stage < ed_stage <= 3 st_stage = 4 - len(in_channels) ed_stage = st_stage + len(in_channels) - 1 for i in range(st_stage, ed_stage + 1): # if i == 3: # lateral_name = 'fpn_inner_res5_sum' # else: # lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) lateral_name = "lateral_convs.{}.conv".format(i - 1) in_c = in_channels[i - st_stage] if self.norm_type is not None: lateral = self.add_sublayer( lateral_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=1, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=in_c))) else: lateral = self.add_sublayer( lateral_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=in_c)))) self.lateral_convs.append(lateral) fpn_name = "fpn_convs.{}.conv".format(i - 1) if self.norm_type is not None: fpn_conv = self.add_sublayer( fpn_name, ConvNormLayer( ch_in=out_channel, ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: fpn_conv = self.add_sublayer( fpn_name, nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(fpn_conv) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) if self.has_extra_convs: for i in range(self.extra_stage): lvl = ed_stage + 1 + i if i == 0 and self.use_c5: in_c = in_channels[-1] else: in_c = out_channel extra_fpn_name = 'fpn_{}'.format(lvl + 2) if self.norm_type is not None: extra_fpn_conv = self.add_sublayer( extra_fpn_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=3, stride=2, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: extra_fpn_conv = self.add_sublayer( extra_fpn_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=3, stride=2, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(extra_fpn_conv) self.init_weights() def init_weights(self): for m in self.lateral_convs: if isinstance(m, (nn.Conv1D, nn.Conv2D)): kaiming_normal_( m.weight, a=0, mode='fan_out', nonlinearity='relu') if m.bias is not None: constant_(m.bias, value=0.) elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): constant_(m.weight, value=1) constant_(m.bias, value=0) for m in self.fpn_convs: if isinstance(m, (nn.Conv1D, nn.Conv2D)): kaiming_normal_( m.weight, a=0, mode='fan_out', nonlinearity='relu') if m.bias is not None: constant_(m.bias, value=0.) 
elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): constant_(m.weight, value=1) constant_(m.bias, value=0) @classmethod def from_config(cls, cfg, input_shape): return {} def forward(self, body_feats): laterals = [] if len(body_feats) > len(self.in_channels): for _ in range(len(body_feats) - len(self.in_channels)): del body_feats[0] num_levels = len(body_feats) # print("body_feats",num_levels) for i in range(num_levels): laterals.append(self.lateral_convs[i](body_feats[i])) for i in range(1, num_levels): lvl = num_levels - i upsample = F.interpolate( laterals[lvl], scale_factor=2., mode='nearest', ) laterals[lvl - 1] += upsample fpn_output = [] for lvl in range(num_levels): fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) if self.extra_stage > 0: # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) if not self.has_extra_convs: assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) else: if self.use_c5: extra_source = body_feats[-1] else: extra_source = fpn_output[-1] fpn_output.append(self.fpn_convs[num_levels](extra_source)) for i in range(1, self.extra_stage): if self.relu_before_extra_convs: fpn_output.append(self.fpn_convs[num_levels + i](F.relu( fpn_output[-1]))) else: fpn_output.append(self.fpn_convs[num_levels + i]( fpn_output[-1])) return fpn_output @property def out_shape(self): return [ ShapeSpec( channels=self.out_channel, stride=1. / s) for s in self.spatial_scales ] ================================================ FILE: ppdet/modeling/necks/csp_pan.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/yolox_pafpn.py import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['CSPPAN'] class ConvBNLayer(nn.Layer): def __init__(self, in_channel=96, out_channel=96, kernel_size=3, stride=1, groups=1, act='leaky_relu'): super(ConvBNLayer, self).__init__() initializer = nn.initializer.KaimingUniform() self.conv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, groups=groups, padding=(kernel_size - 1) // 2, stride=stride, weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn = nn.BatchNorm2D(out_channel) if act == "hard_swish": act = 'hardswish' self.act = act def forward(self, x): x = self.bn(self.conv(x)) if self.act: x = getattr(F, self.act)(x) return x class DPModule(nn.Layer): """ Depth-wise and point-wise module. Args: in_channel (int): The input channels of this Module. out_channel (int): The output channels of this Module. kernel_size (int): The conv2d kernel size of this Module. 
stride (int): The conv2d's stride of this Module. act (str): The activation function of this Module, Now support `leaky_relu` and `hard_swish`. """ def __init__(self, in_channel=96, out_channel=96, kernel_size=3, stride=1, act='leaky_relu', use_act_in_out=True): super(DPModule, self).__init__() initializer = nn.initializer.KaimingUniform() self.use_act_in_out = use_act_in_out self.dwconv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, groups=out_channel, padding=(kernel_size - 1) // 2, stride=stride, weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn1 = nn.BatchNorm2D(out_channel) self.pwconv = nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=1, groups=1, padding=0, weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn2 = nn.BatchNorm2D(out_channel) if act == "hard_swish": act = 'hardswish' self.act = act def forward(self, x): x = self.bn1(self.dwconv(x)) if self.act: x = getattr(F, self.act)(x) x = self.bn2(self.pwconv(x)) if self.use_act_in_out and self.act: x = getattr(F, self.act)(x) return x class DarknetBottleneck(nn.Layer): """The basic bottleneck block used in Darknet. Each Block consists of two ConvModules and the input is added to the final output. Each ConvModule is composed of Conv, BN, and act. The first convLayer has filter size of 1x1 and the second one has the filter size of 3x3. Args: in_channels (int): The input channels of this Module. out_channels (int): The output channels of this Module. expansion (int): The kernel size of the convolution. Default: 0.5 add_identity (bool): Whether to add identity to the out. Default: True use_depthwise (bool): Whether to use depthwise separable convolution. Default: False """ def __init__(self, in_channels, out_channels, kernel_size=3, expansion=0.5, add_identity=True, use_depthwise=False, act="leaky_relu"): super(DarknetBottleneck, self).__init__() hidden_channels = int(out_channels * expansion) conv_func = DPModule if use_depthwise else ConvBNLayer self.conv1 = ConvBNLayer( in_channel=in_channels, out_channel=hidden_channels, kernel_size=1, act=act) self.conv2 = conv_func( in_channel=hidden_channels, out_channel=out_channels, kernel_size=kernel_size, stride=1, act=act) self.add_identity = \ add_identity and in_channels == out_channels def forward(self, x): identity = x out = self.conv1(x) out = self.conv2(out) if self.add_identity: return out + identity else: return out class CSPLayer(nn.Layer): """Cross Stage Partial Layer. Args: in_channels (int): The input channels of the CSP layer. out_channels (int): The output channels of the CSP layer. expand_ratio (float): Ratio to adjust the number of channels of the hidden layer. Default: 0.5 num_blocks (int): Number of blocks. Default: 1 add_identity (bool): Whether to add identity in blocks. Default: True use_depthwise (bool): Whether to depthwise separable convolution in blocks. 
Default: False """ def __init__(self, in_channels, out_channels, kernel_size=3, expand_ratio=0.5, num_blocks=1, add_identity=True, use_depthwise=False, act="leaky_relu"): super().__init__() mid_channels = int(out_channels * expand_ratio) self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) self.final_conv = ConvBNLayer( 2 * mid_channels, out_channels, 1, act=act) self.blocks = nn.Sequential(* [ DarknetBottleneck( mid_channels, mid_channels, kernel_size, 1.0, add_identity, use_depthwise, act=act) for _ in range(num_blocks) ]) def forward(self, x): x_short = self.short_conv(x) x_main = self.main_conv(x) x_main = self.blocks(x_main) x_final = paddle.concat((x_main, x_short), axis=1) return self.final_conv(x_final) class Channel_T(nn.Layer): def __init__(self, in_channels=[116, 232, 464], out_channels=96, act="leaky_relu"): super(Channel_T, self).__init__() self.convs = nn.LayerList() for i in range(len(in_channels)): self.convs.append( ConvBNLayer( in_channels[i], out_channels, 1, act=act)) def forward(self, x): outs = [self.convs[i](x[i]) for i in range(len(x))] return outs @register @serializable class CSPPAN(nn.Layer): """Path Aggregation Network with CSP module. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) kernel_size (int): The conv2d kernel size of this Module. num_features (int): Number of output features of CSPPAN module. num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 use_depthwise (bool): Whether to depthwise separable convolution in blocks. Default: True """ def __init__(self, in_channels, out_channels, kernel_size=5, num_features=3, num_csp_blocks=1, use_depthwise=True, act='hard_swish', spatial_scales=[0.125, 0.0625, 0.03125]): super(CSPPAN, self).__init__() self.conv_t = Channel_T(in_channels, out_channels, act=act) in_channels = [out_channels] * len(spatial_scales) self.in_channels = in_channels self.out_channels = out_channels self.spatial_scales = spatial_scales self.num_features = num_features conv_func = DPModule if use_depthwise else ConvBNLayer if self.num_features == 4: self.first_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.second_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.spatial_scales.append(self.spatial_scales[-1] / 2) # build top-down blocks self.upsample = nn.Upsample(scale_factor=2, mode='nearest') self.top_down_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.top_down_blocks.append( CSPLayer( in_channels[idx - 1] * 2, in_channels[idx - 1], kernel_size=kernel_size, num_blocks=num_csp_blocks, add_identity=False, use_depthwise=use_depthwise, act=act)) # build bottom-up blocks self.downsamples = nn.LayerList() self.bottom_up_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsamples.append( conv_func( in_channels[idx], in_channels[idx], kernel_size=kernel_size, stride=2, act=act)) self.bottom_up_blocks.append( CSPLayer( in_channels[idx] * 2, in_channels[idx + 1], kernel_size=kernel_size, num_blocks=num_csp_blocks, add_identity=False, use_depthwise=use_depthwise, act=act)) def forward(self, inputs): """ Args: inputs (tuple[Tensor]): input features. Returns: tuple[Tensor]: CSPPAN features. 
""" assert len(inputs) == len(self.in_channels) inputs = self.conv_t(inputs) # top-down path inner_outs = [inputs[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( paddle.concat([upsample_feat, feat_low], 1)) inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsamples[idx](feat_low) out = self.bottom_up_blocks[idx](paddle.concat( [downsample_feat, feat_height], 1)) outs.append(out) top_features = None if self.num_features == 4: top_features = self.first_top_conv(inputs[-1]) top_features = top_features + self.second_top_conv(outs[-1]) outs.append(top_features) return tuple(outs) @property def out_shape(self): return [ ShapeSpec( channels=self.out_channels, stride=1. / s) for s in self.spatial_scales ] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } ================================================ FILE: ppdet/modeling/necks/custom_pan.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import math import copy import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import DropBlock, MultiHeadAttention from ppdet.modeling.ops import get_act_fn from ..backbones.cspresnet import ConvBNLayer, BasicBlock from ..shape_spec import ShapeSpec from ..initializer import linear_init_ __all__ = ['CustomCSPPAN'] def _get_clones(module, N): return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) class SPP(nn.Layer): def __init__(self, ch_in, ch_out, k, pool_size, act='swish', data_format='NCHW'): super(SPP, self).__init__() self.pool = [] self.data_format = data_format for i, size in enumerate(pool_size): pool = self.add_sublayer( 'pool{}'.format(i), nn.MaxPool2D( kernel_size=size, stride=1, padding=size // 2, data_format=data_format, ceil_mode=False)) self.pool.append(pool) self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) def forward(self, x): outs = [x] for pool in self.pool: outs.append(pool(x)) if self.data_format == 'NCHW': y = paddle.concat(outs, axis=1) else: y = paddle.concat(outs, axis=-1) y = self.conv(y) return y class CSPStage(nn.Layer): def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False, use_alpha=False): super(CSPStage, self).__init__() ch_mid = int(ch_out // 2) self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) self.convs = nn.Sequential() next_ch_in = ch_mid for i in range(n): self.convs.add_sublayer( str(i), eval(block_fn)(next_ch_in, ch_mid, act=act, shortcut=False, use_alpha=use_alpha)) if i == (n - 1) // 2 and spp: self.convs.add_sublayer( 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) next_ch_in = ch_mid self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act) def forward(self, x): y1 = self.conv1(x) y2 = self.conv2(x) y2 = self.convs(y2) y = paddle.concat([y1, y2], axis=1) y = self.conv3(y) return y class TransformerEncoderLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = 
residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src class TransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward(self, src, src_mask=None, pos_embed=None): output = src for layer in self.layers: output = layer(output, src_mask=src_mask, pos_embed=pos_embed) if self.norm is not None: output = self.norm(output) return output @register @serializable class CustomCSPPAN(nn.Layer): __shared__ = [ 'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt', 'eval_size' ] def __init__(self, in_channels=[256, 512, 1024], out_channels=[1024, 512, 256], norm_type='bn', act='leaky', stage_fn='CSPStage', block_fn='BasicBlock', stage_num=1, block_num=3, drop_block=False, block_size=3, keep_prob=0.9, spp=False, data_format='NCHW', width_mult=1.0, depth_mult=1.0, use_alpha=False, trt=False, dim_feedforward=2048, dropout=0.1, activation='gelu', nhead=4, num_layers=4, attn_dropout=None, act_dropout=None, normalize_before=False, use_trans=False, eval_size=None): super(CustomCSPPAN, self).__init__() out_channels = [max(round(c * width_mult), 1) for c in out_channels] block_num = max(round(block_num * depth_mult), 1) act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.num_blocks = len(in_channels) self.data_format = data_format self._out_channels = out_channels self.hidden_dim = in_channels[-1] in_channels = in_channels[::-1] self.use_trans = use_trans self.eval_size = eval_size if use_trans: if eval_size is not None: self.pos_embed = self.build_2d_sincos_position_embedding( eval_size[1] // 32, eval_size[0] // 32, embed_dim=self.hidden_dim) else: self.pos_embed = None encoder_layer = TransformerEncoderLayer( self.hidden_dim, nhead, dim_feedforward, dropout, activation, attn_dropout, act_dropout, normalize_before) encoder_norm = nn.LayerNorm( self.hidden_dim) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_layers, encoder_norm) fpn_stages = [] fpn_routes = [] for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)): if i > 0: ch_in += ch_pre // 2 stage = nn.Sequential() for j in range(stage_num): stage.add_sublayer( str(j), eval(stage_fn)(block_fn, ch_in if j == 0 else ch_out, ch_out, block_num, act=act, spp=(spp and i == 0), use_alpha=use_alpha)) if drop_block: stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) fpn_stages.append(stage) if i < self.num_blocks - 1: fpn_routes.append( ConvBNLayer( ch_in=ch_out, ch_out=ch_out // 2, filter_size=1, stride=1, padding=0, act=act)) ch_pre = ch_out self.fpn_stages = nn.LayerList(fpn_stages) self.fpn_routes = nn.LayerList(fpn_routes) pan_stages = [] pan_routes = [] for i in reversed(range(self.num_blocks - 1)): pan_routes.append( ConvBNLayer( ch_in=out_channels[i + 1], ch_out=out_channels[i + 1], filter_size=3, stride=2, padding=1, act=act)) ch_in = out_channels[i] + out_channels[i + 1] ch_out = out_channels[i] stage = nn.Sequential() for j in range(stage_num): stage.add_sublayer( str(j), eval(stage_fn)(block_fn, ch_in if j == 0 else ch_out, ch_out, block_num, act=act, spp=False, use_alpha=use_alpha)) if drop_block: stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) pan_stages.append(stage) self.pan_stages = nn.LayerList(pan_stages[::-1]) self.pan_routes = nn.LayerList(pan_routes[::-1]) def build_2d_sincos_position_embedding( self, w, h, 
embed_dim=1024, temperature=10000., ): grid_w = paddle.arange(int(w), dtype=paddle.float32) grid_h = paddle.arange(int(h), dtype=paddle.float32) grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = 1. / (temperature**omega) out_w = grid_w.flatten()[..., None] @omega[None] out_h = grid_h.flatten()[..., None] @omega[None] pos_emb = paddle.concat( [ paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), paddle.cos(out_h) ], axis=1)[None, :, :] return pos_emb def forward(self, blocks, for_mot=False): if self.use_trans: last_feat = blocks[-1] n, c, h, w = last_feat.shape # flatten [B, C, H, W] to [B, HxW, C] src_flatten = last_feat.flatten(2).transpose([0, 2, 1]) if self.eval_size is not None and not self.training: pos_embed = self.pos_embed else: pos_embed = self.build_2d_sincos_position_embedding( w=w, h=h, embed_dim=self.hidden_dim) memory = self.encoder(src_flatten, pos_embed=pos_embed) last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w]) blocks[-1] = last_feat_encode blocks = blocks[::-1] fpn_feats = [] for i, block in enumerate(blocks): if i > 0: block = paddle.concat([route, block], axis=1) route = self.fpn_stages[i](block) fpn_feats.append(route) if i < self.num_blocks - 1: route = self.fpn_routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) pan_feats = [fpn_feats[-1], ] route = fpn_feats[-1] for i in reversed(range(self.num_blocks - 1)): block = fpn_feats[i] route = self.pan_routes[i](route) block = paddle.concat([route, block], axis=1) route = self.pan_stages[i](block) pan_feats.append(route) return pan_feats[::-1] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/necks/dilated_encoder.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
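# Receptive-field arithmetic for the dilation schedule used below (a worked
# illustration, not text from the original file): a 3x3 conv with dilation d
# spans d * (3 - 1) + 1 = 2d + 1 pixels, so residual blocks with dilations
# [2, 4, 6, 8] stack spans of 5, 9, 13 and 17 on the stride-32 C5 map,
# enlarging the receptive field without adding pyramid levels. The encoder
# is size-preserving; a minimal shape check (illustrative sketch):
#
#     import paddle
#     enc = DilatedEncoder()  # defaults: 2048-channel C5 in, 512 out
#     out, = enc([paddle.rand([1, 2048, 20, 20])])
#     assert list(out.shape) == [1, 512, 20, 20]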
import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingUniform, Constant, Normal from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['DilatedEncoder'] class Bottleneck(nn.Layer): def __init__(self, in_channels, mid_channels, dilation): super(Bottleneck, self).__init__() self.conv1 = nn.Sequential(* [ nn.Conv2D( in_channels, mid_channels, 1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(0.0))), nn.BatchNorm2D( mid_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))), nn.ReLU(), ]) self.conv2 = nn.Sequential(* [ nn.Conv2D( mid_channels, mid_channels, 3, padding=dilation, dilation=dilation, weight_attr=ParamAttr(initializer=Normal( mean=0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(0.0))), nn.BatchNorm2D( mid_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))), nn.ReLU(), ]) self.conv3 = nn.Sequential(* [ nn.Conv2D( mid_channels, in_channels, 1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(0.0))), nn.BatchNorm2D( in_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))), nn.ReLU(), ]) def forward(self, x): identity = x y = self.conv3(self.conv2(self.conv1(x))) return y + identity @register class DilatedEncoder(nn.Layer): """ DilatedEncoder used in YOLOF """ def __init__(self, in_channels=[2048], out_channels=[512], block_mid_channels=128, num_residual_blocks=4, block_dilations=[2, 4, 6, 8]): super(DilatedEncoder, self).__init__() self.in_channels = in_channels self.out_channels = out_channels assert len(self.in_channels) == 1, "YOLOF only has one level feature." assert len(self.out_channels) == 1, "YOLOF only has one level feature." 
self.block_mid_channels = block_mid_channels self.num_residual_blocks = num_residual_blocks self.block_dilations = block_dilations out_ch = self.out_channels[0] self.lateral_conv = nn.Conv2D( self.in_channels[0], out_ch, 1, weight_attr=ParamAttr(initializer=KaimingUniform( negative_slope=1, nonlinearity='leaky_relu')), bias_attr=ParamAttr(initializer=Constant(value=0.0))) self.lateral_norm = nn.BatchNorm2D( out_ch, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.fpn_conv = nn.Conv2D( out_ch, out_ch, 3, padding=1, weight_attr=ParamAttr(initializer=KaimingUniform( negative_slope=1, nonlinearity='leaky_relu'))) self.fpn_norm = nn.BatchNorm2D( out_ch, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) encoder_blocks = [] for i in range(self.num_residual_blocks): encoder_blocks.append( Bottleneck( out_ch, self.block_mid_channels, dilation=block_dilations[i])) self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) def forward(self, inputs, for_mot=False): out = self.lateral_norm(self.lateral_conv(inputs[0])) out = self.fpn_norm(self.fpn_conv(out)) out = self.dilated_encoder_blocks(out) return [out] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self.out_channels] ================================================ FILE: ppdet/modeling/necks/es_pan.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
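# NOTE (editor's sketch, not part of the original file): a quick shape check
# for the DilatedEncoder above. The batch size and the 20x20 stride-32 map are
# illustrative assumptions; only the 2048 -> 512 channel contract comes from
# the class defaults.
import paddle
from ppdet.modeling.necks.dilated_encoder import DilatedEncoder

encoder = DilatedEncoder()             # defaults: in_channels [2048] -> out_channels [512]
c5 = paddle.randn([2, 2048, 20, 20])   # e.g. a ResNet C5 map for a 640x640 input
out, = encoder([c5])                   # YOLOF keeps a single feature level
assert out.shape == [2, 512, 20, 20]   # dilated residual blocks preserve resolution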
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from ..backbones.esnet import SEModule from .csp_pan import ConvBNLayer, Channel_T, DPModule __all__ = ['ESPAN'] class ES_Block(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, kernel_size=5, stride=1, act='leaky_relu'): super(ES_Block, self).__init__() self._residual = ConvBNLayer( in_channel=in_channels, out_channel=out_channels, kernel_size=1, stride=1, groups=1, act=act) self._conv_pw = ConvBNLayer( in_channel=in_channels, out_channel=mid_channels // 2, kernel_size=1, stride=1, groups=1, act=act) self._conv_dw = ConvBNLayer( in_channel=mid_channels // 2, out_channel=mid_channels // 2, kernel_size=kernel_size, stride=stride, groups=mid_channels // 2, act=None) self._se = SEModule(mid_channels) self._conv_linear = ConvBNLayer( in_channel=mid_channels, out_channel=out_channels, kernel_size=1, stride=1, groups=1, act=act) self._out_conv = ConvBNLayer( in_channel=out_channels * 2, out_channel=out_channels, kernel_size=1, stride=1, groups=1, act=act) def forward(self, inputs): x1 = self._residual(inputs) x2 = self._conv_pw(inputs) x3 = self._conv_dw(x2) x3 = paddle.concat([x2, x3], axis=1) x3 = self._se(x3) x3 = self._conv_linear(x3) out = paddle.concat([x1, x3], axis=1) out = self._out_conv(out) return out @register @serializable class ESPAN(nn.Layer): """Path Aggregation Network with ES module. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) kernel_size (int): The conv2d kernel size of this Module. num_features (int): Number of output features of CSPPAN module. num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 use_depthwise (bool): Whether to use depthwise separable convolution in blocks.
Default: True """ def __init__(self, in_channels, out_channels, kernel_size=5, num_features=3, use_depthwise=True, act='hard_swish', spatial_scales=[0.125, 0.0625, 0.03125]): super(ESPAN, self).__init__() self.conv_t = Channel_T(in_channels, out_channels, act=act) in_channels = [out_channels] * len(spatial_scales) self.in_channels = in_channels self.out_channels = out_channels self.spatial_scales = spatial_scales self.num_features = num_features conv_func = DPModule if use_depthwise else ConvBNLayer if self.num_features == 4: self.first_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.second_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.spatial_scales.append(self.spatial_scales[-1] / 2) # build top-down blocks self.upsample = nn.Upsample(scale_factor=2, mode='nearest') self.top_down_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.top_down_blocks.append( ES_Block( in_channels[idx - 1] * 2, in_channels[idx - 1], in_channels[idx - 1], kernel_size=kernel_size, stride=1, act=act)) # build bottom-up blocks self.downsamples = nn.LayerList() self.bottom_up_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsamples.append( conv_func( in_channels[idx], in_channels[idx], kernel_size=kernel_size, stride=2, act=act)) self.bottom_up_blocks.append( ES_Block( in_channels[idx] * 2, in_channels[idx + 1], in_channels[idx + 1], kernel_size=kernel_size, stride=1, act=act)) def forward(self, inputs): """ Args: inputs (tuple[Tensor]): input features. Returns: tuple[Tensor]: CSPPAN features. """ assert len(inputs) == len(self.in_channels) inputs = self.conv_t(inputs) # top-down path inner_outs = [inputs[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( paddle.concat([upsample_feat, feat_low], 1)) inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsamples[idx](feat_low) out = self.bottom_up_blocks[idx](paddle.concat( [downsample_feat, feat_height], 1)) outs.append(out) top_features = None if self.num_features == 4: top_features = self.first_top_conv(inputs[-1]) top_features = top_features + self.second_top_conv(outs[-1]) outs.append(top_features) return tuple(outs) @property def out_shape(self): return [ ShapeSpec( channels=self.out_channels, stride=1. / s) for s in self.spatial_scales ] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } ================================================ FILE: ppdet/modeling/necks/fpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
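# NOTE (editor's sketch, not part of the original file): how the ESPAN above is
# wired end to end. The channel and spatial sizes below are illustrative
# assumptions (the channels happen to match the Channel_T defaults); the only
# hard requirement is that consecutive levels differ by a factor-2 stride so
# the upsample/downsample paths line up.
import paddle
from ppdet.modeling.necks.es_pan import ESPAN

neck = ESPAN(in_channels=[116, 232, 464], out_channels=96)
feats = [
    paddle.randn([1, c, s, s])
    for c, s in zip([116, 232, 464], [80, 40, 20])
]
outs = neck(feats)  # tuple of 3 maps, all with 96 channels, at strides 8/16/32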
import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import XavierUniform from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ..shape_spec import ShapeSpec __all__ = ['FPN'] @register @serializable class FPN(nn.Layer): """ Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config out_channel (int): output channel of each level spatial_scales (list[float]): the spatial scales between input feature maps and original input image which can be derived from the output shape of backbone by from_config has_extra_convs (bool): whether to add extra conv to the last level. default False extra_stage (int): the number of extra stages added to the last level. default 1 use_c5 (bool): Whether to use c5 as the input of extra stage, otherwise p5 is used. default True norm_type (string|None): The normalization type in FPN module. If norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default None norm_decay (float): weight decay for normalization layer weights. default 0. freeze_norm (bool): whether to freeze normalization layer. default False relu_before_extra_convs (bool): whether to add relu before extra convs. default False """ def __init__(self, in_channels, out_channel, spatial_scales=[0.25, 0.125, 0.0625, 0.03125], has_extra_convs=False, extra_stage=1, use_c5=True, norm_type=None, norm_decay=0., freeze_norm=False, relu_before_extra_convs=True): super(FPN, self).__init__() self.out_channel = out_channel for s in range(extra_stage): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.has_extra_convs = has_extra_convs self.extra_stage = extra_stage self.use_c5 = use_c5 self.relu_before_extra_convs = relu_before_extra_convs self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.lateral_convs = [] self.fpn_convs = [] fan = out_channel * 3 * 3 # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone # 0 <= st_stage < ed_stage <= 3 st_stage = 4 - len(in_channels) ed_stage = st_stage + len(in_channels) - 1 for i in range(st_stage, ed_stage + 1): if i == 3: lateral_name = 'fpn_inner_res5_sum' else: lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) in_c = in_channels[i - st_stage] if self.norm_type is not None: lateral = self.add_sublayer( lateral_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=1, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=in_c))) else: lateral = self.add_sublayer( lateral_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=in_c)))) self.lateral_convs.append(lateral) fpn_name = 'fpn_res{}_sum'.format(i + 2) if self.norm_type is not None: fpn_conv = self.add_sublayer( fpn_name, ConvNormLayer( ch_in=out_channel, ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: fpn_conv = self.add_sublayer( fpn_name, nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(fpn_conv) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) if self.has_extra_convs: for i in range(self.extra_stage): lvl = ed_stage + 1 + i if i == 0 and self.use_c5: in_c = in_channels[-1] else: in_c = out_channel extra_fpn_name = 'fpn_{}'.format(lvl + 2) if self.norm_type is not None: extra_fpn_conv = self.add_sublayer( extra_fpn_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=3, stride=2, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: extra_fpn_conv = self.add_sublayer( extra_fpn_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=3, stride=2, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(extra_fpn_conv) @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'spatial_scales': [1.0 / i.stride for i in input_shape], } def forward(self, body_feats): laterals = [] num_levels = len(body_feats) for i in range(num_levels): laterals.append(self.lateral_convs[i](body_feats[i])) for i in range(1, num_levels): lvl = num_levels - i upsample = F.interpolate( laterals[lvl], scale_factor=2., mode='nearest', ) laterals[lvl - 1] += upsample fpn_output = [] for lvl in range(num_levels): fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) if self.extra_stage > 0: # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) if not self.has_extra_convs: assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) else: if self.use_c5: extra_source = body_feats[-1] else: extra_source = fpn_output[-1] 
fpn_output.append(self.fpn_convs[num_levels](extra_source)) for i in range(1, self.extra_stage): if self.relu_before_extra_convs: fpn_output.append(self.fpn_convs[num_levels + i](F.relu( fpn_output[-1]))) else: fpn_output.append(self.fpn_convs[num_levels + i]( fpn_output[-1])) return fpn_output @property def out_shape(self): return [ ShapeSpec( channels=self.out_channel, stride=1. / s) for s in self.spatial_scales ] ================================================ FILE: ppdet/modeling/necks/hrfpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn.functional as F import paddle.nn as nn from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['HRFPN'] @register class HRFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone out_channel (int): number of output feature channels share_conv (bool): whether to share conv for different layers' reduction extra_stage (int): add extra stage for returning HRFPN fpn_feats spatial_scales (list): feature map scaling factor """ def __init__(self, in_channels=[18, 36, 72, 144], out_channel=256, share_conv=False, extra_stage=1, spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32], use_bias=False): super(HRFPN, self).__init__() in_channel = sum(in_channels) self.in_channel = in_channel self.out_channel = out_channel self.share_conv = share_conv for i in range(extra_stage): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.num_out = len(self.spatial_scales) self.use_bias = use_bias bias_attr = False if use_bias is False else None self.reduction = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=1, bias_attr=bias_attr) if share_conv: self.fpn_conv = nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, bias_attr=bias_attr) else: self.fpn_conv = [] for i in range(self.num_out): conv_name = "fpn_conv_" + str(i) conv = self.add_sublayer( conv_name, nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, bias_attr=bias_attr)) self.fpn_conv.append(conv) def forward(self, body_feats): num_backbone_stages = len(body_feats) outs = [] outs.append(body_feats[0]) # resize for i in range(1, num_backbone_stages): resized = F.interpolate( body_feats[i], scale_factor=2**i, mode='bilinear') outs.append(resized) # concat out = paddle.concat(outs, axis=1) assert out.shape[ 1] == self.in_channel, 'in_channel should be {}, but received {}'.format( self.in_channel, out.shape[1]) # reduction out = self.reduction(out) # conv outs = [out] for i in range(1, self.num_out): outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i)) outputs = [] for i in range(self.num_out): conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i] conv = conv_func(outs[i]) outputs.append(conv) fpn_feats = [outputs[k] for k in range(self.num_out)] return fpn_feats @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'spatial_scales': [1.0 / i.stride for i in input_shape], } @property def out_shape(self): return [ ShapeSpec( channels=self.out_channel, stride=1. / s) for s in self.spatial_scales ] ================================================ FILE: ppdet/modeling/necks/lc_pan.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from ..backbones.lcnet import DepthwiseSeparable from .csp_pan import ConvBNLayer, Channel_T, DPModule __all__ = ['LCPAN'] @register @serializable class LCPAN(nn.Layer): """Path Aggregation Network with LCNet module. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) kernel_size (int): The conv2d kernel size of this Module. num_features (int): Number of output features of CSPPAN module. num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 use_depthwise (bool): Whether to use depthwise separable convolution in blocks.
Default: True """ def __init__(self, in_channels, out_channels, kernel_size=5, num_features=3, use_depthwise=True, act='hard_swish', spatial_scales=[0.125, 0.0625, 0.03125]): super(LCPAN, self).__init__() self.conv_t = Channel_T(in_channels, out_channels, act=act) in_channels = [out_channels] * len(spatial_scales) self.in_channels = in_channels self.out_channels = out_channels self.spatial_scales = spatial_scales self.num_features = num_features conv_func = DPModule if use_depthwise else ConvBNLayer NET_CONFIG = { #k, in_c, out_c, stride, use_se "block1": [ [kernel_size, out_channels * 2, out_channels * 2, 1, False], [kernel_size, out_channels * 2, out_channels, 1, False], ], "block2": [ [kernel_size, out_channels * 2, out_channels * 2, 1, False], [kernel_size, out_channels * 2, out_channels, 1, False], ] } if self.num_features == 4: self.first_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.second_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.spatial_scales.append(self.spatial_scales[-1] / 2) # build top-down blocks self.upsample = nn.Upsample(scale_factor=2, mode='nearest') self.top_down_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.top_down_blocks.append( nn.Sequential(* [ DepthwiseSeparable( num_channels=in_c, num_filters=out_c, dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ "block1"]) ])) # build bottom-up blocks self.downsamples = nn.LayerList() self.bottom_up_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsamples.append( conv_func( in_channels[idx], in_channels[idx], kernel_size=kernel_size, stride=2, act=act)) self.bottom_up_blocks.append( nn.Sequential(* [ DepthwiseSeparable( num_channels=in_c, num_filters=out_c, dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ "block2"]) ])) def forward(self, inputs): """ Args: inputs (tuple[Tensor]): input features. Returns: tuple[Tensor]: CSPPAN features. """ assert len(inputs) == len(self.in_channels) inputs = self.conv_t(inputs) # top-down path inner_outs = [inputs[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( paddle.concat([upsample_feat, feat_low], 1)) inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsamples[idx](feat_low) out = self.bottom_up_blocks[idx](paddle.concat( [downsample_feat, feat_height], 1)) outs.append(out) top_features = None if self.num_features == 4: top_features = self.first_top_conv(inputs[-1]) top_features = top_features + self.second_top_conv(outs[-1]) outs.append(top_features) return tuple(outs) @property def out_shape(self): return [ ShapeSpec( channels=self.out_channels, stride=1. / s) for s in self.spatial_scales ] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } ================================================ FILE: ppdet/modeling/necks/ttf_fpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform from ppdet.core.workspace import register, serializable from paddle.regularizer import L2Decay from ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv import math from ppdet.modeling.ops import batch_norm from ..shape_spec import ShapeSpec __all__ = ['TTFFPN'] class Upsample(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn'): super(Upsample, self).__init__() fan_in = ch_in * 3 * 3 stdv = 1. / math.sqrt(fan_in) self.dcn = DeformableConvV2( ch_in, ch_out, kernel_size=3, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=ParamAttr( initializer=Constant(0), regularizer=L2Decay(0.), learning_rate=2.), lr_scale=2., regularizer=L2Decay(0.)) self.bn = batch_norm( ch_out, norm_type=norm_type, initializer=Constant(1.)) def forward(self, feat): dcn = self.dcn(feat) bn = self.bn(dcn) relu = F.relu(bn) out = F.interpolate(relu, scale_factor=2., mode='bilinear') return out class DeConv(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn'): super(DeConv, self).__init__() self.deconv = nn.Sequential() conv1 = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, stride=1, filter_size=1, norm_type=norm_type, initializer=XavierUniform()) conv2 = nn.Conv2DTranspose( in_channels=ch_out, out_channels=ch_out, kernel_size=4, padding=1, stride=2, groups=ch_out, weight_attr=ParamAttr(initializer=XavierUniform()), bias_attr=False) bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.) conv3 = ConvNormLayer( ch_in=ch_out, ch_out=ch_out, stride=1, filter_size=1, norm_type=norm_type, initializer=XavierUniform()) self.deconv.add_sublayer('conv1', conv1) self.deconv.add_sublayer('relu6_1', nn.ReLU6()) self.deconv.add_sublayer('conv2', conv2) self.deconv.add_sublayer('bn', bn) self.deconv.add_sublayer('relu6_2', nn.ReLU6()) self.deconv.add_sublayer('conv3', conv3) self.deconv.add_sublayer('relu6_3', nn.ReLU6()) def forward(self, inputs): return self.deconv(inputs) class LiteUpsample(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn'): super(LiteUpsample, self).__init__() self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type) self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type) def forward(self, inputs): deconv_up = self.deconv(inputs) conv = self.conv(inputs) interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear') return deconv_up + interp_up class ShortCut(nn.Layer): def __init__(self, layer_num, ch_in, ch_out, norm_type='bn', lite_neck=False, name=None): super(ShortCut, self).__init__() shortcut_conv = nn.Sequential() for i in range(layer_num): fan_out = 3 * 3 * ch_out std = math.sqrt(2. 
/ fan_out) in_channels = ch_in if i == 0 else ch_out shortcut_name = name + '.conv.{}'.format(i) if lite_neck: shortcut_conv.add_sublayer( shortcut_name, LiteConv( in_channels=in_channels, out_channels=ch_out, with_act=i < layer_num - 1, norm_type=norm_type)) else: shortcut_conv.add_sublayer( shortcut_name, nn.Conv2D( in_channels=in_channels, out_channels=ch_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0, std)), bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.)))) if i < layer_num - 1: shortcut_conv.add_sublayer(shortcut_name + '.act', nn.ReLU()) self.shortcut = self.add_sublayer('shortcut', shortcut_conv) def forward(self, feat): out = self.shortcut(feat) return out @register @serializable class TTFFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone. [128,256,512,1024] by default, means the channels of DarkNet53 backbone return_idx [1,2,3,4]. planes (list): the number of output feature channels of FPN. [256, 128, 64] by default shortcut_num (list): the number of convolution layers in each shortcut. [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default lite_neck (bool): whether to use lite conv in TTFNet FPN, False by default fusion_method (string): the method to fusion upsample and lateral layer. 'add' and 'concat' are optional, add by default """ __shared__ = ['norm_type'] def __init__(self, in_channels, planes=[256, 128, 64], shortcut_num=[3, 2, 1], norm_type='bn', lite_neck=False, fusion_method='add'): super(TTFFPN, self).__init__() self.planes = planes self.shortcut_num = shortcut_num[::-1] self.shortcut_len = len(shortcut_num) self.ch_in = in_channels[::-1] self.fusion_method = fusion_method self.upsample_list = [] self.shortcut_list = [] self.upper_list = [] for i, out_c in enumerate(self.planes): in_c = self.ch_in[i] if i == 0 else self.upper_list[-1] upsample_module = LiteUpsample if lite_neck else Upsample upsample = self.add_sublayer( 'upsample.' + str(i), upsample_module( in_c, out_c, norm_type=norm_type)) self.upsample_list.append(upsample) if i < self.shortcut_len: shortcut = self.add_sublayer( 'shortcut.' + str(i), ShortCut( self.shortcut_num[i], self.ch_in[i + 1], out_c, norm_type=norm_type, lite_neck=lite_neck, name='shortcut.' + str(i))) self.shortcut_list.append(shortcut) if self.fusion_method == 'add': upper_c = out_c elif self.fusion_method == 'concat': upper_c = out_c * 2 else: raise ValueError('Illegal fusion method. Expected add or\ concat, but received {}'.format(self.fusion_method)) self.upper_list.append(upper_c) def forward(self, inputs): feat = inputs[-1] for i, out_c in enumerate(self.planes): feat = self.upsample_list[i](feat) if i < self.shortcut_len: shortcut = self.shortcut_list[i](inputs[-i - 2]) if self.fusion_method == 'add': feat = feat + shortcut else: feat = paddle.concat([feat, shortcut], axis=1) return feat @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=self.upper_list[-1], )] ================================================ FILE: ppdet/modeling/necks/yolo_fpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import DropBlock from ppdet.modeling.ops import get_act_fn from ..backbones.darknet import ConvBNLayer from ..shape_spec import ShapeSpec from ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer __all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN', 'YOLOCSPPAN'] def add_coord(x, data_format): b = x.shape[0] if data_format == 'NCHW': h, w = x.shape[2], x.shape[3] else: h, w = x.shape[1], x.shape[2] gx = paddle.cast(paddle.arange(w) / ((w - 1.) * 2.0) - 1., x.dtype) gy = paddle.cast(paddle.arange(h) / ((h - 1.) * 2.0) - 1., x.dtype) if data_format == 'NCHW': gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w]) gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w]) else: gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1]) gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1]) gx.stop_gradient = True gy.stop_gradient = True return gx, gy class YoloDetBlock(nn.Layer): def __init__(self, ch_in, channel, norm_type, freeze_norm=False, name='', data_format='NCHW'): """ YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 Args: ch_in (int): input channel channel (int): base channel norm_type (str): batch norm type freeze_norm (bool): whether to freeze norm, default False name (str): layer name data_format (str): data format, NCHW or NHWC """ super(YoloDetBlock, self).__init__() self.ch_in = ch_in self.channel = channel assert channel % 2 == 0, \ "channel {} cannot be divided by 2".format(channel) conv_def = [ ['conv0', ch_in, channel, 1, '.0.0'], ['conv1', channel, channel * 2, 3, '.0.1'], ['conv2', channel * 2, channel, 1, '.1.0'], ['conv3', channel, channel * 2, 3, '.1.1'], ['route', channel * 2, channel, 1, '.2'], ] self.conv_module = nn.Sequential() for idx, (conv_name, ch_in, ch_out, filter_size, post_name) in enumerate(conv_def): self.conv_module.add_sublayer( conv_name, ConvBNLayer( ch_in=ch_in, ch_out=ch_out, filter_size=filter_size, padding=(filter_size - 1) // 2, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name + post_name)) self.tip = ConvBNLayer( ch_in=channel, ch_out=channel * 2, filter_size=3, padding=1, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name + '.tip') def forward(self, inputs): route = self.conv_module(inputs) tip = self.tip(route) return route, tip class SPP(nn.Layer): def __init__(self, ch_in, ch_out, k, pool_size, norm_type='bn', freeze_norm=False, name='', act='leaky', data_format='NCHW'): """ SPP layer, which consist of four pooling layer follwed by conv layer Args: ch_in (int): input channel of conv layer ch_out (int): output channel of conv layer k (int): kernel size of conv layer norm_type (str): batch norm type freeze_norm (bool): whether to freeze norm, default False name (str): layer name act (str): activation function data_format (str): data format, NCHW or NHWC """ super(SPP, self).__init__() 
self.pool = [] self.data_format = data_format for size in pool_size: pool = self.add_sublayer( '{}.pool1'.format(name), nn.MaxPool2D( kernel_size=size, stride=1, padding=size // 2, data_format=data_format, ceil_mode=False)) self.pool.append(pool) self.conv = ConvBNLayer( ch_in, ch_out, k, padding=k // 2, norm_type=norm_type, freeze_norm=freeze_norm, name=name, act=act, data_format=data_format) def forward(self, x): outs = [x] for pool in self.pool: outs.append(pool(x)) if self.data_format == "NCHW": y = paddle.concat(outs, axis=1) else: y = paddle.concat(outs, axis=-1) y = self.conv(y) return y class CoordConv(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, padding, norm_type, freeze_norm=False, name='', data_format='NCHW'): """ CoordConv layer, see https://arxiv.org/abs/1807.03247 Args: ch_in (int): input channel ch_out (int): output channel filter_size (int): filter size, default 3 padding (int): padding size, default 0 norm_type (str): batch norm type, default bn name (str): layer name data_format (str): data format, NCHW or NHWC """ super(CoordConv, self).__init__() self.conv = ConvBNLayer( ch_in + 2, ch_out, filter_size=filter_size, padding=padding, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name) self.data_format = data_format def forward(self, x): gx, gy = add_coord(x, self.data_format) if self.data_format == 'NCHW': y = paddle.concat([x, gx, gy], axis=1) else: y = paddle.concat([x, gx, gy], axis=-1) y = self.conv(y) return y class PPYOLODetBlock(nn.Layer): def __init__(self, cfg, name, data_format='NCHW'): """ PPYOLODetBlock layer Args: cfg (list): layer configs for this block name (str): block name data_format (str): data format, NCHW or NHWC """ super(PPYOLODetBlock, self).__init__() self.conv_module = nn.Sequential() for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]): kwargs.update( name='{}.{}'.format(name, conv_name), data_format=data_format) self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs)) conv_name, layer, args, kwargs = cfg[-1] kwargs.update( name='{}.{}'.format(name, conv_name), data_format=data_format) self.tip = layer(*args, **kwargs) def forward(self, inputs): route = self.conv_module(inputs) tip = self.tip(route) return route, tip class PPYOLOTinyDetBlock(nn.Layer): def __init__(self, ch_in, ch_out, name, drop_block=False, block_size=3, keep_prob=0.9, data_format='NCHW'): """ PPYOLO Tiny DetBlock layer Args: ch_in (list): input channel number ch_out (list): output channel number name (str): block name drop_block: whether user DropBlock block_size: drop block size keep_prob: probability to keep block in DropBlock data_format (str): data format, NCHW or NHWC """ super(PPYOLOTinyDetBlock, self).__init__() self.drop_block_ = drop_block self.conv_module = nn.Sequential() cfgs = [ # name, in channels, out channels, filter_size, # stride, padding, groups ['.0', ch_in, ch_out, 1, 1, 0, 1], ['.1', ch_out, ch_out, 5, 1, 2, ch_out], ['.2', ch_out, ch_out, 1, 1, 0, 1], ['.route', ch_out, ch_out, 5, 1, 2, ch_out], ] for cfg in cfgs: conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \ groups = cfg self.conv_module.add_sublayer( name + conv_name, ConvBNLayer( ch_in=conv_ch_in, ch_out=conv_ch_out, filter_size=filter_size, stride=stride, padding=padding, groups=groups, name=name + conv_name)) self.tip = ConvBNLayer( ch_in=ch_out, ch_out=ch_out, filter_size=1, stride=1, padding=0, groups=1, name=name + conv_name) if self.drop_block_: self.drop_block = DropBlock( block_size=block_size, 
keep_prob=keep_prob, data_format=data_format, name=name + '.dropblock') def forward(self, inputs): if self.drop_block_: inputs = self.drop_block(inputs) route = self.conv_module(inputs) tip = self.tip(route) return route, tip class PPYOLODetBlockCSP(nn.Layer): def __init__(self, cfg, ch_in, ch_out, act, norm_type, name, data_format='NCHW'): """ PPYOLODetBlockCSP layer Args: cfg (list): layer configs for this block ch_in (int): input channel ch_out (int): output channel act (str): default mish name (str): block name data_format (str): data format, NCHW or NHWC """ super(PPYOLODetBlockCSP, self).__init__() self.data_format = data_format self.conv1 = ConvBNLayer( ch_in, ch_out, 1, padding=0, act=act, norm_type=norm_type, name=name + '.left', data_format=data_format) self.conv2 = ConvBNLayer( ch_in, ch_out, 1, padding=0, act=act, norm_type=norm_type, name=name + '.right', data_format=data_format) self.conv3 = ConvBNLayer( ch_out * 2, ch_out * 2, 1, padding=0, act=act, norm_type=norm_type, name=name, data_format=data_format) self.conv_module = nn.Sequential() for idx, (layer_name, layer, args, kwargs) in enumerate(cfg): kwargs.update(name=name + layer_name, data_format=data_format) self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs)) def forward(self, inputs): conv_left = self.conv1(inputs) conv_right = self.conv2(inputs) conv_left = self.conv_module(conv_left) if self.data_format == 'NCHW': conv = paddle.concat([conv_left, conv_right], axis=1) else: conv = paddle.concat([conv_left, conv_right], axis=-1) conv = self.conv3(conv) return conv, conv @register @serializable class YOLOv3FPN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[256, 512, 1024], norm_type='bn', freeze_norm=False, data_format='NCHW'): """ YOLOv3FPN layer Args: in_channels (list): input channels for fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC """ super(YOLOv3FPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_blocks = len(in_channels) self._out_channels = [] self.yolo_blocks = [] self.routes = [] self.data_format = data_format for i in range(self.num_blocks): name = 'yolo_block.{}'.format(i) in_channel = in_channels[-i - 1] if i > 0: in_channel += 512 // (2**i) yolo_block = self.add_sublayer( name, YoloDetBlock( in_channel, channel=512 // (2**i), norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.yolo_blocks.append(yolo_block) # tip layer output channel doubled self._out_channels.append(1024 // (2**i)) if i < self.num_blocks - 1: name = 'yolo_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=512 // (2**i), ch_out=256 // (2**i), filter_size=1, stride=1, padding=0, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.routes.append(route) def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] yolo_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., 
data_format=self.data_format) if for_mot: return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} else: return yolo_feats @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class PPYOLOFPN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[512, 1024, 2048], norm_type='bn', freeze_norm=False, data_format='NCHW', coord_conv=False, conv_block_num=2, drop_block=False, block_size=3, keep_prob=0.9, spp=False): """ PPYOLOFPN layer Args: in_channels (list): input channels for fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC coord_conv (bool): whether use CoordConv or not conv_block_num (int): conv block num of each pan block drop_block (bool): whether use DropBlock or not block_size (int): block size of DropBlock keep_prob (float): keep probability of DropBlock spp (bool): whether use spp or not """ super(PPYOLOFPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_blocks = len(in_channels) # parse kwargs self.coord_conv = coord_conv self.drop_block = drop_block self.block_size = block_size self.keep_prob = keep_prob self.spp = spp self.conv_block_num = conv_block_num self.data_format = data_format if self.coord_conv: ConvLayer = CoordConv else: ConvLayer = ConvBNLayer if self.drop_block: dropblock_cfg = [[ 'dropblock', DropBlock, [self.block_size, self.keep_prob], dict() ]] else: dropblock_cfg = [] self._out_channels = [] self.yolo_blocks = [] self.routes = [] for i, ch_in in enumerate(self.in_channels[::-1]): if i > 0: ch_in += 512 // (2**i) channel = 64 * (2**self.num_blocks) // (2**i) base_cfg = [] c_in, c_out = ch_in, channel for j in range(self.conv_block_num): base_cfg += [ [ 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], dict( padding=0, norm_type=norm_type, freeze_norm=freeze_norm) ], [ 'conv{}'.format(2 * j + 1), ConvBNLayer, [c_out, c_out * 2, 3], dict( padding=1, norm_type=norm_type, freeze_norm=freeze_norm) ], ] c_in, c_out = c_out * 2, c_out base_cfg += [[ 'route', ConvLayer, [c_in, c_out, 1], dict( padding=0, norm_type=norm_type, freeze_norm=freeze_norm) ], [ 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( padding=1, norm_type=norm_type, freeze_norm=freeze_norm) ]] if self.conv_block_num == 2: if i == 0: if self.spp: spp_cfg = [[ 'spp', SPP, [channel * 4, channel, 1], dict( pool_size=[5, 9, 13], norm_type=norm_type, freeze_norm=freeze_norm) ]] else: spp_cfg = [] cfg = base_cfg[0:3] + spp_cfg + base_cfg[ 3:4] + dropblock_cfg + base_cfg[4:6] else: cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6] elif self.conv_block_num == 0: if self.spp and i == 0: spp_cfg = [[ 'spp', SPP, [c_in * 4, c_in, 1], dict( pool_size=[5, 9, 13], norm_type=norm_type, freeze_norm=freeze_norm) ]] else: spp_cfg = [] cfg = spp_cfg + dropblock_cfg + base_cfg name = 'yolo_block.{}'.format(i) yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name)) self.yolo_blocks.append(yolo_block) self._out_channels.append(channel * 2) if i < self.num_blocks - 1: name = 'yolo_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=channel, ch_out=256 // (2**i), filter_size=1, stride=1, padding=0, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.routes.append(route) def forward(self, blocks, for_mot=False): assert len(blocks) == 
self.num_blocks blocks = blocks[::-1] yolo_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) if for_mot: return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} else: return yolo_feats @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class PPYOLOTinyFPN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[80, 56, 34], detection_block_channels=[160, 128, 96], norm_type='bn', data_format='NCHW', **kwargs): """ PPYOLO Tiny FPN layer Args: in_channels (list): input channels for fpn detection_block_channels (list): channels in fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC kwargs: extra key-value pairs, such as parameter of DropBlock and spp """ super(PPYOLOTinyFPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels[::-1] assert len(detection_block_channels ) > 0, "detection_block_channelslength should > 0" self.detection_block_channels = detection_block_channels self.data_format = data_format self.num_blocks = len(in_channels) # parse kwargs self.drop_block = kwargs.get('drop_block', False) self.block_size = kwargs.get('block_size', 3) self.keep_prob = kwargs.get('keep_prob', 0.9) self.spp_ = kwargs.get('spp', False) if self.spp_: self.spp = SPP(self.in_channels[0] * 4, self.in_channels[0], k=1, pool_size=[5, 9, 13], norm_type=norm_type, name='spp') self._out_channels = [] self.yolo_blocks = [] self.routes = [] for i, ( ch_in, ch_out ) in enumerate(zip(self.in_channels, self.detection_block_channels)): name = 'yolo_block.{}'.format(i) if i > 0: ch_in += self.detection_block_channels[i - 1] yolo_block = self.add_sublayer( name, PPYOLOTinyDetBlock( ch_in, ch_out, name, drop_block=self.drop_block, block_size=self.block_size, keep_prob=self.keep_prob)) self.yolo_blocks.append(yolo_block) self._out_channels.append(ch_out) if i < self.num_blocks - 1: name = 'yolo_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=ch_out, ch_out=ch_out, filter_size=1, stride=1, padding=0, norm_type=norm_type, data_format=data_format, name=name)) self.routes.append(route) def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] yolo_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i == 0 and self.spp_: block = self.spp(block) if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) if for_mot: return {'yolo_feats': 
yolo_feats, 'emb_feats': emb_feats} else: return yolo_feats @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class PPYOLOPAN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[512, 1024, 2048], norm_type='bn', data_format='NCHW', act='mish', conv_block_num=3, drop_block=False, block_size=3, keep_prob=0.9, spp=False): """ PPYOLOPAN layer with SPP, DropBlock and CSP connection. Args: in_channels (list): input channels for fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC act (str): activation function, default mish conv_block_num (int): conv block num of each pan block drop_block (bool): whether use DropBlock or not block_size (int): block size of DropBlock keep_prob (float): keep probability of DropBlock spp (bool): whether use spp or not """ super(PPYOLOPAN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_blocks = len(in_channels) # parse kwargs self.drop_block = drop_block self.block_size = block_size self.keep_prob = keep_prob self.spp = spp self.conv_block_num = conv_block_num self.data_format = data_format if self.drop_block: dropblock_cfg = [[ 'dropblock', DropBlock, [self.block_size, self.keep_prob], dict() ]] else: dropblock_cfg = [] # fpn self.fpn_blocks = [] self.fpn_routes = [] fpn_channels = [] for i, ch_in in enumerate(self.in_channels[::-1]): if i > 0: ch_in += 512 // (2**(i - 1)) channel = 512 // (2**i) base_cfg = [] for j in range(self.conv_block_num): base_cfg += [ # name, layer, args [ '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], dict( padding=0, act=act, norm_type=norm_type) ], [ '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], dict( padding=1, act=act, norm_type=norm_type) ] ] if i == 0 and self.spp: base_cfg[3] = [ 'spp', SPP, [channel * 4, channel, 1], dict( pool_size=[5, 9, 13], act=act, norm_type=norm_type) ] cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] name = 'fpn.{}'.format(i) fpn_block = self.add_sublayer( name, PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, data_format)) self.fpn_blocks.append(fpn_block) fpn_channels.append(channel * 2) if i < self.num_blocks - 1: name = 'fpn_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=channel * 2, ch_out=channel, filter_size=1, stride=1, padding=0, act=act, norm_type=norm_type, data_format=data_format, name=name)) self.fpn_routes.append(route) # pan self.pan_blocks = [] self.pan_routes = [] self._out_channels = [512 // (2**(self.num_blocks - 2)), ] for i in reversed(range(self.num_blocks - 1)): name = 'pan_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=fpn_channels[i + 1], ch_out=fpn_channels[i + 1], filter_size=3, stride=2, padding=1, act=act, norm_type=norm_type, data_format=data_format, name=name)) self.pan_routes = [route, ] + self.pan_routes base_cfg = [] ch_in = fpn_channels[i] + fpn_channels[i + 1] channel = 512 // (2**i) for j in range(self.conv_block_num): base_cfg += [ # name, layer, args [ '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], dict( padding=0, act=act, norm_type=norm_type) ], [ '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], dict( padding=1, act=act, norm_type=norm_type) ] ] cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] name = 'pan.{}'.format(i) pan_block = self.add_sublayer( 
name, PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, data_format)) self.pan_blocks = [pan_block, ] + self.pan_blocks self._out_channels.append(channel * 2) self._out_channels = self._out_channels[::-1] def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] fpn_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.fpn_blocks[i](block) fpn_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.fpn_routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) pan_feats = [fpn_feats[-1], ] route = fpn_feats[self.num_blocks - 1] for i in reversed(range(self.num_blocks - 1)): block = fpn_feats[i] route = self.pan_routes[i](route) if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.pan_blocks[i](block) pan_feats.append(tip) if for_mot: return {'yolo_feats': pan_feats[::-1], 'emb_feats': emb_feats} else: return pan_feats[::-1] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class YOLOCSPPAN(nn.Layer): """ YOLO CSP-PAN, used in YOLOv5 and YOLOX. """ __shared__ = ['depth_mult', 'data_format', 'act', 'trt'] def __init__(self, depth_mult=1.0, in_channels=[256, 512, 1024], depthwise=False, data_format='NCHW', act='silu', trt=False): super(YOLOCSPPAN, self).__init__() self.in_channels = in_channels self._out_channels = in_channels Conv = DWConv if depthwise else BaseConv self.data_format = data_format act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.upsample = nn.Upsample(scale_factor=2, mode="nearest") # top-down fpn self.lateral_convs = nn.LayerList() self.fpn_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.lateral_convs.append( BaseConv( int(in_channels[idx]), int(in_channels[idx - 1]), 1, 1, act=act)) self.fpn_blocks.append( CSPLayer( int(in_channels[idx - 1] * 2), int(in_channels[idx - 1]), round(3 * depth_mult), shortcut=False, depthwise=depthwise, act=act)) # bottom-up pan self.downsample_convs = nn.LayerList() self.pan_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsample_convs.append( Conv( int(in_channels[idx]), int(in_channels[idx]), 3, stride=2, act=act)) self.pan_blocks.append( CSPLayer( int(in_channels[idx] * 2), int(in_channels[idx + 1]), round(3 * depth_mult), shortcut=False, depthwise=depthwise, act=act)) def forward(self, feats, for_mot=False): assert len(feats) == len(self.in_channels) # top-down fpn inner_outs = [feats[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = feats[idx - 1] feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( feat_heigh) inner_outs[0] = feat_heigh upsample_feat = F.interpolate( feat_heigh, scale_factor=2., mode="nearest", data_format=self.data_format) inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( paddle.concat( [upsample_feat, feat_low], axis=1)) inner_outs.insert(0, inner_out) # bottom-up pan outs = [inner_outs[0]] for 
idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsample_convs[idx](feat_low) out = self.pan_blocks[idx](paddle.concat( [downsample_feat, feat_height], axis=1)) outs.append(out) return outs @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/ops.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn.functional as F import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay try: import paddle._legacy_C_ops as C_ops except ImportError: import paddle._C_ops as C_ops try: from paddle.framework import in_dynamic_or_pir_mode HAVE_PIR = True except ImportError: HAVE_PIR = False from paddle import in_dynamic_mode from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype __all__ = [ 'prior_box', 'generate_proposals', 'box_coder', 'multiclass_nms', 'distribute_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', 'silu', 'swish', 'identity', 'anchor_generator' ] def identity(x): return x def mish(x): return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x)) def silu(x): return F.silu(x) def swish(x): return x * F.sigmoid(x) TRT_ACT_SPEC = {'swish': swish, 'silu': swish} ACT_SPEC = {'mish': mish, 'silu': silu} def get_act_fn(act=None, trt=False): assert act is None or isinstance(act, ( str, dict)), 'name of activation should be str, dict or None' if not act: return identity if isinstance(act, dict): name = act['name'] act.pop('name') kwargs = act else: name = act kwargs = dict() if trt and name in TRT_ACT_SPEC: fn = TRT_ACT_SPEC[name] elif name in ACT_SPEC: fn = ACT_SPEC[name] else: fn = getattr(F, name) return lambda x: fn(x, **kwargs) def batch_norm(ch, norm_type='bn', norm_decay=0., freeze_norm=False, initializer=None, data_format='NCHW'): norm_lr = 0. if freeze_norm else 1. weight_attr = ParamAttr( initializer=initializer, learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) if norm_type in ['sync_bn', 'bn']: norm_layer = nn.BatchNorm2D( ch, weight_attr=weight_attr, bias_attr=bias_attr, data_format=data_format) norm_params = norm_layer.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True return norm_layer @paddle.jit.not_to_static def anchor_generator(input, anchor_sizes=None, aspect_ratios=None, variance=[0.1, 0.1, 0.2, 0.2], stride=None, offset=0.5): """ **Anchor generator operator** Generate anchors for Faster RCNN algorithm. Each position of the input produces N anchors, N = size(anchor_sizes) * size(aspect_ratios).
The order of generated anchors is firstly aspect_ratios loop then anchor_sizes loop. Args: input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map. anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated anchors, given in absolute pixels e.g. [64., 128., 256., 512.]. For instance, the anchor size of 64 means the area of this anchor equals to 64**2. None by default. aspect_ratios(float32|list|tuple, optional): The height / width ratios of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default. variance(list|tuple, optional): The variances to be used in box regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by default. stride(list|tuple, optional): The anchors stride across width and height. The data type is float32. e.g. [16.0, 16.0]. None by default. offset(float32, optional): Prior boxes center offset. 0.5 by default. Returns: Tuple: Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. H is the height of input, W is the width of input, num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. Variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. H is the height of input, W is the width of input num_anchors is the box count of each position. Each variance is in (xcenter, ycenter, w, h) format. Examples: .. code-block:: python import paddle.fluid as fluid conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32') anchor, var = fluid.layers.anchor_generator( input=conv1, anchor_sizes=[64, 128, 256, 512], aspect_ratios=[0.5, 1.0, 2.0], variance=[0.1, 0.1, 0.2, 0.2], stride=[16.0, 16.0], offset=0.5) """ def _is_list_or_tuple_(data): return (isinstance(data, list) or isinstance(data, tuple)) if not _is_list_or_tuple_(anchor_sizes): anchor_sizes = [anchor_sizes] if not _is_list_or_tuple_(aspect_ratios): aspect_ratios = [aspect_ratios] if not (_is_list_or_tuple_(stride) and len(stride) == 2): raise ValueError('stride should be a list or tuple ', 'with length 2, (stride_width, stride_height).') anchor_sizes = list(map(float, anchor_sizes)) aspect_ratios = list(map(float, aspect_ratios)) stride = list(map(float, stride)) if in_dynamic_mode(): attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios, 'variances', variance, 'stride', stride, 'offset', offset) anchor, var = C_ops.anchor_generator(input, *attrs) return anchor, var helper = LayerHelper("anchor_generator", **locals()) dtype = helper.input_dtype() attrs = { 'anchor_sizes': anchor_sizes, 'aspect_ratios': aspect_ratios, 'variances': variance, 'stride': stride, 'offset': offset } anchor = helper.create_variable_for_type_inference(dtype) var = helper.create_variable_for_type_inference(dtype) helper.append_op( type="anchor_generator", inputs={"Input": input}, outputs={"Anchors": anchor, "Variances": var}, attrs=attrs, ) anchor.stop_gradient = True var.stop_gradient = True return anchor, var @paddle.jit.not_to_static def distribute_fpn_proposals(fpn_rois, min_level, max_level, refer_level, refer_scale, pixel_offset=False, rois_num=None, name=None): r""" **This op only takes LoDTensor as input.** In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN level, with respect to scale of the proposals, the referring scale and the referring level. Besides, to restore the order of proposals, we return an array which indicates the original index of rois in current proposals. 
To compute FPN level for each roi, the formula is given as follows: .. math:: roi\_scale = \sqrt{BBoxArea(fpn\_roi)}, \qquad level = \left\lfloor \log_{2}\left(\frac{roi\_scale}{refer\_scale}\right) + refer\_level \right\rfloor where BBoxArea is a function to compute the area of each roi. Args: fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is float32 or float64. The input fpn_rois. min_level(int32): The lowest level of FPN layer where the proposals come from. max_level(int32): The highest level of FPN layer where the proposals come from. refer_level(int32): The referring level of FPN layer with specified scale. refer_scale(int32): The referring scale of FPN layer with specified level. rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. If it is not None then return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually there is no need to set name; None by default. Returns: Tuple: multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] and data type of float32 or float64. The length is max_level-min_level+1. The proposals in each FPN level. restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is the number of total rois. The data type is int32. It is used to restore the order of fpn_rois. rois_num_per_level(List): A list of 1-D Tensor and each Tensor is the RoIs' number in each image on the corresponding level. The shape is [B] and data type of int32. B is the number of images. Examples: .. code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() fpn_rois = paddle.static.data( name='data', shape=[None, 4], dtype='float32', lod_level=1) multi_rois, restore_ind = ops.distribute_fpn_proposals( fpn_rois=fpn_rois, min_level=2, max_level=5, refer_level=4, refer_scale=224) """ num_lvl = max_level - min_level + 1 if in_dynamic_mode(): assert rois_num is not None, "rois_num should not be None in dygraph mode."
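# NOTE (illustration added for readability; not part of the original source):
# the legacy C++ op invoked below implements the FPN level rule from the
# docstring. A minimal numpy sketch of that rule, assuming the defaults
# min_level=2, max_level=5, refer_level=4, refer_scale=224 (the helper name
# `fpn_level` is hypothetical):
#
#     import numpy as np
#
#     def fpn_level(box, min_level=2, max_level=5, refer_level=4,
#                   refer_scale=224):
#         w, h = box[2] - box[0], box[3] - box[1]
#         roi_scale = np.sqrt(w * h)
#         lvl = np.floor(np.log2(roi_scale / refer_scale + 1e-8) + refer_level)
#         return int(np.clip(lvl, min_level, max_level))
#
#     fpn_level([0., 0., 224., 224.])  # -> 4: a 224x224 RoI maps to the refer level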
attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', refer_level, 'refer_scale', refer_scale, 'pixel_offset', pixel_offset) multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( fpn_rois, rois_num, num_lvl, num_lvl, *attrs) return multi_rois, restore_ind, rois_num_per_level else: check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], 'distribute_fpn_proposals') helper = LayerHelper('distribute_fpn_proposals', **locals()) dtype = helper.input_dtype('fpn_rois') multi_rois = [ helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) ] restore_ind = helper.create_variable_for_type_inference(dtype='int32') inputs = {'FpnRois': fpn_rois} outputs = { 'MultiFpnRois': multi_rois, 'RestoreIndex': restore_ind, } if rois_num is not None: inputs['RoisNum'] = rois_num rois_num_per_level = [ helper.create_variable_for_type_inference(dtype='int32') for i in range(num_lvl) ] outputs['MultiLevelRoIsNum'] = rois_num_per_level else: rois_num_per_level = None helper.append_op( type='distribute_fpn_proposals', inputs=inputs, outputs=outputs, attrs={ 'min_level': min_level, 'max_level': max_level, 'refer_level': refer_level, 'refer_scale': refer_scale, 'pixel_offset': pixel_offset }) return multi_rois, restore_ind, rois_num_per_level @paddle.jit.not_to_static def prior_box(input, image, min_sizes, max_sizes=None, aspect_ratios=[1.], variance=[0.1, 0.1, 0.2, 0.2], flip=False, clip=False, steps=[0.0, 0.0], offset=0.5, min_max_aspect_ratios_order=False, name=None): """ This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is determined by the count of min_sizes, max_sizes and aspect_ratios, The size of the box is in range(min_size, max_size) interval, which is generated in sequence according to the aspect_ratios. Parameters: input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, the data type should be float32 or float64. min_sizes(list|tuple|float): the min sizes of generated prior boxes. max_sizes(list|tuple|None): the max sizes of generated prior boxes. Default: None. aspect_ratios(list|tuple|float): the aspect ratios of generated prior boxes. Default: [1.]. variance(list|tuple): the variances to be encoded in prior boxes. Default:[0.1, 0.1, 0.2, 0.2]. flip(bool): Whether to flip aspect ratios. Default:False. clip(bool): Whether to clip out-of-boundary boxes. Default: False. step(list|tuple): Prior boxes step across width and height, If step[0] equals to 0.0 or step[1] equals to 0.0, the prior boxes step across height or weight of the input will be automatically calculated. Default: [0., 0.] offset(float): Prior boxes center offset. Default: 0.5 min_max_aspect_ratios_order(bool): If set True, the output prior box is in order of [min, max, aspect_ratios], which is consistent with Caffe. Please note, this order affects the weights order of convolution layer followed by and does not affect the final detection results. Default: False. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: Tuple: A tuple with two Variable (boxes, variances) boxes(Tensor): the output prior boxes of PriorBox. 4-D tensor, the layout is [H, W, num_priors, 4]. H is the height of input, W is the width of input, num_priors is the total box count of each position of input. 
variances(Tensor): the expanded variances of PriorBox. 4-D tensor, the layout is [H, W, num_priors, 4]. H is the height of input, W is the width of input, num_priors is the total box count of each position of input. Examples: .. code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() input = paddle.static.data(name="input", shape=[None,3,6,9]) image = paddle.static.data(name="image", shape=[None,3,9,12]) box, var = ops.prior_box( input=input, image=image, min_sizes=[100.], clip=True, flip=True) """ return paddle.vision.ops.prior_box( input, image, min_sizes, max_sizes, aspect_ratios, variance, flip, clip, steps, offset, min_max_aspect_ratios_order, name, ) @paddle.jit.not_to_static def multiclass_nms(bboxes, scores, score_threshold, nms_top_k, keep_top_k, nms_threshold=0.3, normalized=True, nms_eta=1., background_label=-1, return_index=False, return_rois_num=True, rois_num=None, name=None): """ This operator is to do multi-class non maximum suppression (NMS) on boxes and scores. In the NMS step, this operator greedily selects a subset of detection bounding boxes whose scores are larger than score_threshold (if this threshold is provided), then keeps the top nms_top_k boxes by confidence if nms_top_k is larger than -1. Then this operator prunes away boxes that have high IOU (intersection over union) overlap with already selected boxes by adaptive threshold NMS based on parameters of nms_threshold and nms_eta. After the NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. Args: bboxes (Tensor): Two types of bboxes are supported: 1. (Tensor) A 3-D Tensor with shape [N, M, K] (K being 4, 8, 16, 24 or 32) represents the predicted locations of M bounding bboxes, N is the batch size. Each bounding box has four coordinate values and the layout is [xmin, ymin, xmax, ymax], when box size equals to 4. 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] M is the number of bounding boxes, C is the class number scores (Tensor): Two types of scores are supported: 1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the predicted confidence predictions. N is the batch size, C is the class number, M is the number of bounding boxes. For each category there are total M scores corresponding to the M bounding boxes. Please note, M is equal to the 2nd dimension of BBoxes. 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. M is the number of bbox, C is the class number. In this case, input BBoxes should be the second case with shape [M, C, 4]. background_label (int): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: -1 score_threshold (float): Threshold to filter out bounding boxes with low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The coefficient for adaptive NMS threshold decay. Default: 1.0 keep_top_k (int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. normalized (bool): Whether detections are normalized. Default: True return_index(bool): Whether return selected index. Default: False rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. If it is not None then return a list of 1-D Tensor.
Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. name(str): Name of the multiclass nms op. Default: None. Returns: A tuple with two Variables: (Out, Index) if return_index is True, otherwise, a tuple with one Variable(Out) is returned. Out: A 2-D LoDTensor with shape [No, 6] represents the detections. Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] or A 2-D LoDTensor with shape [No, 10] represents the detections. Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the total number of detections. If all images have not detected results, all elements in LoD will be 0, and output tensor is empty (None). Index: Only return when return_index is True. A 2-D LoDTensor with shape [No, 1] represents the selected index which type is Integer. The index is the absolute value cross batches. No is the same number as Out. If the index is used to gather other attribute such as age, one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where N is the batch size and M is the number of boxes. Examples: .. code-block:: python import paddle from ppdet.modeling import ops boxes = paddle.static.data(name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) scores = paddle.static.data(name='scores', shape=[81], dtype='float32', lod_level=1) out, index = ops.multiclass_nms(bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True) """ helper = LayerHelper('multiclass_nms3', **locals()) if HAVE_PIR and in_dynamic_or_pir_mode(): # https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/ops/yaml/ops.yaml#L3175 attrs = (score_threshold, nms_top_k, keep_top_k, nms_threshold, normalized, nms_eta, background_label, ) output, index, nms_rois_num = paddle._C_ops.multiclass_nms3(bboxes, scores, rois_num, *attrs) if not return_index: index = None return output, nms_rois_num, index elif in_dynamic_mode(): attrs = ('background_label', background_label, 'score_threshold', score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, 'normalized', normalized) output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, rois_num, *attrs) if not return_index: index = None return output, nms_rois_num, index else: output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) index = helper.create_variable_for_type_inference(dtype='int32') inputs = {'BBoxes': bboxes, 'Scores': scores} outputs = {'Out': output, 'Index': index} if rois_num is not None: inputs['RoisNum'] = rois_num if return_rois_num: nms_rois_num = helper.create_variable_for_type_inference( dtype='int32') outputs['NmsRoisNum'] = nms_rois_num helper.append_op( type="multiclass_nms3", inputs=inputs, attrs={ 'background_label': background_label, 'score_threshold': score_threshold, 'nms_top_k': nms_top_k, 'nms_threshold': nms_threshold, 'keep_top_k': keep_top_k, 'nms_eta': nms_eta, 'normalized': normalized }, outputs=outputs) output.stop_gradient = True index.stop_gradient = True if not return_index: index = None if not return_rois_num: nms_rois_num = None return output, nms_rois_num, index @paddle.jit.not_to_static def matrix_nms(bboxes, scores, score_threshold, post_threshold, nms_top_k, keep_top_k, use_gaussian=False, gaussian_sigma=2., background_label=0, normalized=True, return_index=False, return_rois_num=True, name=None): """ **Matrix NMS** This operator does 
matrix non maximum suppression (NMS). First selects a subset of candidate bounding boxes that have higher scores than score_threshold (if provided), then the top nms_top_k candidates are kept if nms_top_k is larger than -1. Scores of the remaining candidates are then decayed according to the Matrix NMS scheme. After the NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. Args: bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes, N is the batch size. Each bounding box has four coordinate values and the layout is [xmin, ymin, xmax, ymax], when box size equals to 4. The data type is float32 or float64. scores (Tensor): A 3-D Tensor with shape [N, C, M] represents the predicted confidence predictions. N is the batch size, C is the class number, M is the number of bounding boxes. For each category there are total M scores corresponding to the M bounding boxes. Please note, M is equal to the 2nd dimension of BBoxes. The data type is float32 or float64. score_threshold (float): Threshold to filter out bounding boxes with low confidence score. post_threshold (float): Threshold to filter out bounding boxes with low confidence score AFTER decaying. nms_top_k (int): Maximum number of detections to be kept according to the confidences after the filtering detections based on score_threshold. keep_top_k (int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. use_gaussian (bool): Use Gaussian as the decay function. Default: False gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 background_label (int): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: 0 normalized (bool): Whether detections are normalized. Default: True return_index(bool): Whether return selected index. Default: False return_rois_num(bool): whether return rois_num. Default: True name(str): Name of the matrix nms op. Default: None. Returns: A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, otherwise, a tuple with two Tensor (Out, RoisNum) is returned. Out (Tensor): A 2-D Tensor with shape [No, 6] containing the detection results. Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] (After version 1.3, when no boxes detected, the lod is changed from {0} to {1}) Index (Tensor): A 2-D Tensor with shape [No, 1] containing the selected indices, which are absolute indices across batches. rois_num (Tensor): A 1-D Tensor with shape [N] containing the number of detected boxes in each image. Examples: ..
code-block:: python import paddle from ppdet.modeling import ops boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], dtype='float32', lod_level=1) scores = paddle.static.data(name='scores', shape=[None,81], dtype='float32', lod_level=1) out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, post_threshold=0.1, nms_top_k=400, keep_top_k=200, normalized=False) """ check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms') check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], 'matrix_nms') check_type(score_threshold, 'score_threshold', float, 'matrix_nms') check_type(post_threshold, 'post_threshold', float, 'matrix_nms') check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms') check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') check_type(normalized, 'normalized', bool, 'matrix_nms') check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') check_type(background_label, 'background_label', int, 'matrix_nms') if in_dynamic_mode(): attrs = ('background_label', background_label, 'score_threshold', score_threshold, 'post_threshold', post_threshold, 'nms_top_k', nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', use_gaussian, 'keep_top_k', keep_top_k, 'normalized', normalized) out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) if not return_index: index = None if not return_rois_num: rois_num = None return out, rois_num, index else: helper = LayerHelper('matrix_nms', **locals()) output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) index = helper.create_variable_for_type_inference(dtype='int32') outputs = {'Out': output, 'Index': index} if return_rois_num: rois_num = helper.create_variable_for_type_inference(dtype='int32') outputs['RoisNum'] = rois_num helper.append_op( type="matrix_nms", inputs={'BBoxes': bboxes, 'Scores': scores}, attrs={ 'background_label': background_label, 'score_threshold': score_threshold, 'post_threshold': post_threshold, 'nms_top_k': nms_top_k, 'gaussian_sigma': gaussian_sigma, 'use_gaussian': use_gaussian, 'keep_top_k': keep_top_k, 'normalized': normalized }, outputs=outputs) output.stop_gradient = True if not return_index: index = None if not return_rois_num: rois_num = None return output, rois_num, index @paddle.jit.not_to_static def box_coder(prior_box, prior_box_var, target_box, code_type="encode_center_size", box_normalized=True, axis=0, name=None): r""" **Box Coder Layer** Encode/Decode the target bounding box with the priorbox information. The encoding schema is described below: .. math:: ox = (tx - px) / pw / pxv oy = (ty - py) / ph / pyv ow = \log(|tw / pw|) / pwv oh = \log(|th / ph|) / phv The decoding schema is described below: .. math:: ox = pw * pxv * tx + px oy = ph * pyv * ty + py ow = \exp(pwv * tw) * pw oh = \exp(phv * th) * ph and the decoded box is returned in corner form as [ox - ow / 2, oy - oh / 2, ox + ow / 2, oy + oh / 2]. Here `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height when encoding (when decoding they are the encoded deltas). Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the encoded/decoded coordinates, width and height. During Box Decoding, two modes for broadcast are supported. Say target box has shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior box will broadcast to target box along the assigned axis.
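As a quick numeric check of the decoding schema: with pxv = 0.1, pwv = 0.2, a prior box of center px = 50 and width pw = 20, and deltas tx = 1.0, tw = 0.0, the decoded center is 20 * 0.1 * 1.0 + 50 = 52 and the decoded width is exp(0.2 * 0.0) * 20 = 20, i.e. corners [42, 62] along x.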
Args: prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape [M, 4] holds M boxes and data type is float32 or float64. Each box is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. prior_box_var(List|Tensor|None): prior_box_var supports three types of input. One is Tensor with shape [M, 4] which holds M group and data type is float32 or float64. The second is list consist of 4 elements shared by all boxes and data type is float32 or float64. Other is None and not involved in calculation. target_box(Tensor): This input can be a 2-D LoDTensor with shape [N, 4] when code_type is 'encode_center_size'. This input also can be a 3-D Tensor with shape [N, M, 4] when code_type is 'decode_center_size'. Each box is represented as [xmin, ymin, xmax, ymax]. The data type is float32 or float64. code_type(str): The code type used with the target box. It can be `encode_center_size` or `decode_center_size`. `encode_center_size` by default. box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. axis(int): Which axis in PriorBox to broadcast for box decode, for example, if axis is 0 and TargetBox has shape [N, M, 4] and PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] for decoding. It is only valid when code type is `decode_center_size`. Set 0 by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: Tensor: output_box(Tensor): When code_type is 'encode_center_size', the output tensor of box_coder_op with shape [N, M, 4] representing the result of N target boxes encoded with M Prior boxes and variances. When code_type is 'decode_center_size', N represents the batch size and M represents the number of decoded boxes. Examples: .. 
code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() # For encode prior_box_encode = paddle.static.data(name='prior_box_encode', shape=[512, 4], dtype='float32') target_box_encode = paddle.static.data(name='target_box_encode', shape=[81, 4], dtype='float32') output_encode = ops.box_coder(prior_box=prior_box_encode, prior_box_var=[0.1,0.1,0.2,0.2], target_box=target_box_encode, code_type="encode_center_size") # For decode prior_box_decode = paddle.static.data(name='prior_box_decode', shape=[512, 4], dtype='float32') target_box_decode = paddle.static.data(name='target_box_decode', shape=[512, 81, 4], dtype='float32') output_decode = ops.box_coder(prior_box=prior_box_decode, prior_box_var=[0.1,0.1,0.2,0.2], target_box=target_box_decode, code_type="decode_center_size", box_normalized=False, axis=1) """ check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], 'box_coder') check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], 'box_coder') if in_dynamic_mode(): if isinstance(prior_box_var, Variable): output_box = C_ops.box_coder( prior_box, prior_box_var, target_box, "code_type", code_type, "box_normalized", box_normalized, "axis", axis) elif isinstance(prior_box_var, list): output_box = C_ops.box_coder( prior_box, None, target_box, "code_type", code_type, "box_normalized", box_normalized, "axis", axis, "variance", prior_box_var) else: raise TypeError( "Input variance of box_coder must be Variable or list") return output_box else: helper = LayerHelper("box_coder", **locals()) output_box = helper.create_variable_for_type_inference( dtype=prior_box.dtype) inputs = {"PriorBox": prior_box, "TargetBox": target_box} attrs = { "code_type": code_type, "box_normalized": box_normalized, "axis": axis } if isinstance(prior_box_var, Variable): inputs['PriorBoxVar'] = prior_box_var elif isinstance(prior_box_var, list): attrs['variance'] = prior_box_var else: raise TypeError( "Input variance of box_coder must be Variable or list") helper.append_op( type="box_coder", inputs=inputs, attrs=attrs, outputs={"OutputBox": output_box}) return output_box @paddle.jit.not_to_static def generate_proposals(scores, bbox_deltas, im_shape, anchors, variances, pre_nms_top_n=6000, post_nms_top_n=1000, nms_thresh=0.5, min_size=0.1, eta=1.0, pixel_offset=False, return_rois_num=False, name=None): """ **Generate proposal Faster-RCNN** This operation proposes RoIs according to each box with their probability to be a foreground object and the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) 2. Calculate box locations as proposals candidates. 3. Clip boxes to image 4. Remove predicted boxes with small area. 5. Apply NMS to get final proposals as output. Args: scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. N is batch size, A is number of anchors, H and W are height and width of the feature map. The data type must be float32. bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location. The data type must be float32. im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the origin image size or input size. The data type can be float32 or float64. 
anchors(Tensor): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32. variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. The data type must be float32. pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. The data type must be float32. `6000` by default. post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. The data type must be float32. `1000` by default. nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. min_size(float): Remove predicted boxes with either height or width < min_size. The data type must be float32. `0.1` by default. eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, `adaptive_threshold = adaptive_threshold * eta` in each iteration. return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. 'False' by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: tuple: A tuple with format ``(rpn_rois, rpn_roi_probs)``. - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. Examples: .. code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, im_shape, anchors, variances) """ if in_dynamic_mode(): assert return_rois_num, "return_rois_num should be True in dygraph mode." 
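# NOTE (illustration added for readability; not part of the original source):
# steps 3-4 of the pipeline described in the docstring (clip proposals to the
# image, drop boxes smaller than min_size) reduce to a few array ops. A hedged
# numpy sketch (the helper name `clip_and_filter` is hypothetical):
#
#     import numpy as np
#
#     def clip_and_filter(boxes, im_shape, min_size=0.1):
#         h, w = im_shape                               # image height, width
#         boxes[:, 0::2] = boxes[:, 0::2].clip(0, w)    # clip x1, x2 to [0, W]
#         boxes[:, 1::2] = boxes[:, 1::2].clip(0, h)    # clip y1, y2 to [0, H]
#         ws = boxes[:, 2] - boxes[:, 0]
#         hs = boxes[:, 3] - boxes[:, 1]
#         return boxes[(ws >= min_size) & (hs >= min_size)]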
attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, 'pixel_offset', pixel_offset) rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( scores, bbox_deltas, im_shape, anchors, variances, *attrs) if not return_rois_num: rpn_rois_num = None return rpn_rois, rpn_roi_probs, rpn_rois_num else: helper = LayerHelper('generate_proposals_v2', **locals()) check_variable_and_dtype(scores, 'scores', ['float32'], 'generate_proposals_v2') check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], 'generate_proposals_v2') check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'], 'generate_proposals_v2') check_variable_and_dtype(anchors, 'anchors', ['float32'], 'generate_proposals_v2') check_variable_and_dtype(variances, 'variances', ['float32'], 'generate_proposals_v2') rpn_rois = helper.create_variable_for_type_inference( dtype=bbox_deltas.dtype) rpn_roi_probs = helper.create_variable_for_type_inference( dtype=scores.dtype) outputs = { 'RpnRois': rpn_rois, 'RpnRoiProbs': rpn_roi_probs, } if return_rois_num: rpn_rois_num = helper.create_variable_for_type_inference( dtype='int32') rpn_rois_num.stop_gradient = True outputs['RpnRoisNum'] = rpn_rois_num helper.append_op( type="generate_proposals_v2", inputs={ 'Scores': scores, 'BboxDeltas': bbox_deltas, 'ImShape': im_shape, 'Anchors': anchors, 'Variances': variances }, attrs={ 'pre_nms_topN': pre_nms_top_n, 'post_nms_topN': post_nms_top_n, 'nms_thresh': nms_thresh, 'min_size': min_size, 'eta': eta, 'pixel_offset': pixel_offset }, outputs=outputs) rpn_rois.stop_gradient = True rpn_roi_probs.stop_gradient = True if not return_rois_num: rpn_rois_num = None return rpn_rois, rpn_roi_probs, rpn_rois_num def sigmoid_cross_entropy_with_logits(input, label, ignore_index=-100, normalize=False): output = F.binary_cross_entropy_with_logits(input, label, reduction='none') mask_tensor = paddle.cast(label != ignore_index, 'float32') output = paddle.multiply(output, mask_tensor) if normalize: sum_valid_mask = paddle.sum(mask_tensor) output = output / sum_valid_mask return output def smooth_l1(input, label, inside_weight=None, outside_weight=None, sigma=None): input_new = paddle.multiply(input, inside_weight) label_new = paddle.multiply(label, inside_weight) delta = 1 / (sigma * sigma) out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) out = paddle.multiply(out, outside_weight) out = out / delta out = paddle.reshape(out, shape=[out.shape[0], -1]) out = paddle.sum(out, axis=1) return out def channel_shuffle(x, groups): batch_size, num_channels, height, width = x.shape[0:4] assert num_channels % groups == 0, 'num_channels should be divisible by groups' channels_per_group = num_channels // groups x = paddle.reshape( x=x, shape=[batch_size, groups, channels_per_group, height, width]) x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) return x def get_static_shape(tensor): shape = paddle.shape(tensor) shape.stop_gradient = True return shape ================================================ FILE: ppdet/modeling/post_process.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import nonempty_bbox from .transformers import bbox_cxcywh_to_xyxy try: from collections.abc import Sequence except Exception: from collections import Sequence __all__ = [ 'BBoxPostProcess', 'MaskPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess', 'DETRPostProcess', 'SparsePostProcess', 'DETRBBoxSemiPostProcess' ] @register class BBoxPostProcess(object): __shared__ = ['num_classes', 'export_onnx', 'export_eb'] __inject__ = ['decode', 'nms'] def __init__(self, num_classes=80, decode=None, nms=None, export_onnx=False, export_eb=False): super(BBoxPostProcess, self).__init__() self.num_classes = num_classes self.decode = decode self.nms = nms self.export_onnx = export_onnx self.export_eb = export_eb def __call__(self, head_out, rois, im_shape, scale_factor): """ Decode the bbox and do NMS if needed. Args: head_out (tuple): bbox_pred and cls_prob of bbox_head output. rois (tuple): roi and rois_num of rpn_head output. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. export_onnx (bool): whether export model to onnx Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding to the input image, the bboxes may be used in other branch. bbox_num (Tensor): The number of prediction boxes of each batch with shape [1], and is N. """ if self.nms is not None: bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) bbox_pred, bbox_num, before_nms_indexes = self.nms(bboxes, score, self.num_classes) else: bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, scale_factor) if self.export_onnx: # add fake box after postprocess when exporting onnx fake_bboxes = paddle.to_tensor( np.array( [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) bbox_pred = paddle.concat([bbox_pred, fake_bboxes]) bbox_num = bbox_num + 1 if self.nms is not None: return bbox_pred, bbox_num, before_nms_indexes else: return bbox_pred, bbox_num def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): """ Rescale, clip and filter the bbox from the output of NMS to get final prediction. Notes: Currently only support bs = 1. Args: bboxes (Tensor): The output bboxes with shape [N, 6] after decode and NMS, including labels, scores and bboxes. bbox_num (Tensor): The number of prediction boxes of each batch with shape [1], and is N. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. Returns: pred_result (Tensor): The final prediction results with shape [N, 6] including labels, scores and bboxes. """ if self.export_eb: # enable rcnn models for edgeboard hw to skip the following postprocess. 
return bboxes, bboxes, bbox_num if not self.export_onnx: bboxes_list = [] bbox_num_list = [] id_start = 0 fake_bboxes = paddle.to_tensor( np.array( [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) # add fake bbox when output is empty for each batch for i in range(bbox_num.shape[0]): if bbox_num[i] == 0: bboxes_i = fake_bboxes bbox_num_i = fake_bbox_num else: bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] bbox_num_i = bbox_num[i:i + 1] # id_start: 0-dim, bbox_num: 1-dim. Use bbox_num[i] instead of bbox_num[i:i+1] in pir. id_start += bbox_num[i] bboxes_list.append(bboxes_i) bbox_num_list.append(bbox_num_i) bboxes = paddle.concat(bboxes_list) bbox_num = paddle.concat(bbox_num_list) origin_shape = paddle.floor(im_shape / scale_factor + 0.5) if not self.export_onnx: origin_shape_list = [] scale_factor_list = [] # scale_factor: scale_y, scale_x for i in range(bbox_num.shape[0]): expand_shape = paddle.expand(origin_shape[i:i + 1, :], [bbox_num[i:i + 1], 2]) scale_y, scale_x = scale_factor[i, 0], scale_factor[i, 1] # TODO(PIR): something wrong with slice op, remove unsqueeze in the future. scale_y = paddle.unsqueeze(scale_y, 0) scale_x = paddle.unsqueeze(scale_x, 0) scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) expand_scale = paddle.expand(scale, [bbox_num[i:i + 1], 4]) origin_shape_list.append(expand_shape) scale_factor_list.append(expand_scale) self.origin_shape_list = paddle.concat(origin_shape_list) scale_factor_list = paddle.concat(scale_factor_list) else: # simplify the computation for bs=1 when exporting onnx scale_y, scale_x = scale_factor[0][0], scale_factor[0][1] scale = paddle.concat( [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0) self.origin_shape_list = paddle.expand(origin_shape, [bbox_num[0:1], 2]) scale_factor_list = paddle.expand(scale, [bbox_num[0:1], 4]) # bboxes: [N, 6], label, score, bbox pred_label = bboxes[:, 0:1] pred_score = bboxes[:, 1:2] pred_bbox = bboxes[:, 2:] # rescale bbox to original image scaled_bbox = pred_bbox / scale_factor_list origin_h = self.origin_shape_list[:, 0] origin_w = self.origin_shape_list[:, 1] zeros = paddle.zeros_like(origin_h) # clip bbox to [0, original_size] x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros) y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros) x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros) y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros) pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1) # filter empty bbox keep_mask = nonempty_bbox(pred_bbox, return_mask=True) keep_mask = paddle.unsqueeze(keep_mask, [1]) pred_label = paddle.where(keep_mask, pred_label, paddle.ones_like(pred_label) * -1) pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) return bboxes, pred_result, bbox_num def get_origin_shape(self, ): return self.origin_shape_list @register class MaskPostProcess(object): __shared__ = ['export_onnx', 'assign_on_cpu'] """ refer to: https://github.com/facebookresearch/detectron2/layers/mask_ops.py Get Mask output according to the output from model """ def __init__(self, binary_thresh=0.5, export_onnx=False, assign_on_cpu=False): super(MaskPostProcess, self).__init__() self.binary_thresh = binary_thresh self.export_onnx = export_onnx self.assign_on_cpu = assign_on_cpu def __call__(self, mask_out, bboxes, bbox_num, origin_shape): """ Decode the mask_out and paste the mask to the origin image. 
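Each low-resolution mask is resampled into its box region at the original image resolution (see paste_mask below) and then binarized with binary_thresh.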
Args: mask_out (Tensor): mask_head output with shape [N, 28, 28]. bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode and NMS, including labels, scores and bboxes. bbox_num (Tensor): The number of prediction boxes of each batch with shape [1], and is N. origin_shape (Tensor): The origin shape of the input image, the tensor shape is [N, 2], and each row is [h, w]. Returns: pred_result (Tensor): The final prediction mask results with shape [N, h, w] in binary mask style. """ num_mask = mask_out.shape[0] origin_shape = paddle.cast(origin_shape, 'int32') device = paddle.device.get_device() if self.export_onnx: h, w = origin_shape[0][0], origin_shape[0][1] mask_onnx = paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], h, w, self.assign_on_cpu) mask_onnx = mask_onnx >= self.binary_thresh pred_result = paddle.cast(mask_onnx, 'int32') else: max_h = paddle.max(origin_shape[:, 0]) max_w = paddle.max(origin_shape[:, 1]) pred_result = paddle.zeros( [num_mask, max_h, max_w], dtype='int32') - 1 id_start = 0 for i in range(bbox_num.shape[0]): bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :] im_h = origin_shape[i, 0] im_w = origin_shape[i, 1] pred_mask = paste_mask(mask_out_i[:, None, :, :], bboxes_i[:, 2:], im_h, im_w, self.assign_on_cpu) pred_mask = paddle.cast(pred_mask >= self.binary_thresh, 'int32') pred_result[id_start:id_start + bbox_num[i], :im_h, : im_w] = pred_mask id_start += bbox_num[i] if self.assign_on_cpu: paddle.set_device(device) return pred_result @register class JDEBBoxPostProcess(nn.Layer): __shared__ = ['num_classes'] __inject__ = ['decode', 'nms'] def __init__(self, num_classes=1, decode=None, nms=None, return_idx=True): super(JDEBBoxPostProcess, self).__init__() self.num_classes = num_classes self.decode = decode self.nms = nms self.return_idx = return_idx self.fake_bbox_pred = paddle.to_tensor( np.array( [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) self.fake_nms_keep_idx = paddle.to_tensor( np.array( [[0]], dtype='int32')) self.fake_yolo_boxes_out = paddle.to_tensor( np.array( [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32')) self.fake_yolo_scores_out = paddle.to_tensor( np.array( [[[0.0]]], dtype='float32')) self.fake_boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64')) def forward(self, head_out, anchors): """ Decode the bbox and do NMS for JDE model. Args: head_out (list): Bbox_pred and cls_prob of bbox_head output. anchors (list): Anchors of JDE model. Returns: boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. bbox_pred (Tensor): The output is the prediction with shape [N, 6] including labels, scores and bboxes. bbox_num (Tensor): The number of prediction of each batch with shape [N]. nms_keep_idx (Tensor): The index of kept bboxes after NMS. 
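Note: when no boxes survive decoding or NMS, fixed fake outputs (a single [-1, 0, 0, 0, 0, 0] box) are returned so that downstream shapes stay valid.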
""" boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors) if len(boxes_idx) == 0: boxes_idx = self.fake_boxes_idx yolo_boxes_out = self.fake_yolo_boxes_out yolo_scores_out = self.fake_yolo_scores_out else: yolo_boxes = paddle.gather_nd(yolo_boxes_scores, boxes_idx) # TODO: only support bs=1 now yolo_boxes_out = paddle.reshape( yolo_boxes[:, :4], shape=[1, len(boxes_idx), 4]) yolo_scores_out = paddle.reshape( yolo_boxes[:, 4:5], shape=[1, 1, len(boxes_idx)]) boxes_idx = boxes_idx[:, 1:] if self.return_idx: bbox_pred, bbox_num, nms_keep_idx = self.nms( yolo_boxes_out, yolo_scores_out, self.num_classes) if bbox_pred.shape[0] == 0: bbox_pred = self.fake_bbox_pred bbox_num = self.fake_bbox_num nms_keep_idx = self.fake_nms_keep_idx return boxes_idx, bbox_pred, bbox_num, nms_keep_idx else: bbox_pred, bbox_num, _ = self.nms(yolo_boxes_out, yolo_scores_out, self.num_classes) if bbox_pred.shape[0] == 0: bbox_pred = self.fake_bbox_pred bbox_num = self.fake_bbox_num return _, bbox_pred, bbox_num, _ @register class CenterNetPostProcess(object): """ Postprocess the model outputs to get final prediction: 1. Do NMS for heatmap to get top `max_per_img` bboxes. 2. Decode bboxes using center offset and box size. 3. Rescale decoded bboxes reference to the origin image shape. Args: max_per_img(int): the maximum number of predicted objects in a image, 500 by default. down_ratio(int): the down ratio from images to heatmap, 4 by default. regress_ltrb (bool): whether to regress left/top/right/bottom or width/height for a box, true by default. """ __shared__ = ['down_ratio'] def __init__(self, max_per_img=500, down_ratio=4, regress_ltrb=True): super(CenterNetPostProcess, self).__init__() self.max_per_img = max_per_img self.down_ratio = down_ratio self.regress_ltrb = regress_ltrb # _simple_nms() _topk() are same as TTFBox in ppdet/modeling/layers.py def _simple_nms(self, heat, kernel=3): """ Use maxpool to filter the max score, get local peaks. """ pad = (kernel - 1) // 2 hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) keep = paddle.cast(hmax == heat, 'float32') return heat * keep def _topk(self, scores): """ Select top k scores and decode to get xy coordinates. 
""" k = self.max_per_img shape_fm = paddle.shape(scores) shape_fm.stop_gradient = True cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] # batch size is 1 scores_r = paddle.reshape(scores, [cat, -1]) topk_scores, topk_inds = paddle.topk(scores_r, k) topk_ys = topk_inds // width topk_xs = topk_inds % width topk_score_r = paddle.reshape(topk_scores, [-1]) topk_score, topk_ind = paddle.topk(topk_score_r, k) k_t = paddle.full(topk_ind.shape, k, dtype='int64') topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') topk_inds = paddle.reshape(topk_inds, [-1]) topk_ys = paddle.reshape(topk_ys, [-1, 1]) topk_xs = paddle.reshape(topk_xs, [-1, 1]) topk_inds = paddle.gather(topk_inds, topk_ind) topk_ys = paddle.gather(topk_ys, topk_ind) topk_xs = paddle.gather(topk_xs, topk_ind) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def __call__(self, hm, wh, reg, im_shape, scale_factor): # 1.get clses and scores, note that hm had been done sigmoid heat = self._simple_nms(hm) scores, inds, topk_clses, ys, xs = self._topk(heat) clses = topk_clses.unsqueeze(1) scores = scores.unsqueeze(1) # 2.get bboxes, note only support batch_size=1 now reg_t = paddle.transpose(reg, [0, 2, 3, 1]) reg = paddle.reshape(reg_t, [-1, reg_t.shape[-1]]) reg = paddle.gather(reg, inds) xs = paddle.cast(xs, 'float32') ys = paddle.cast(ys, 'float32') xs = xs + reg[:, 0:1] ys = ys + reg[:, 1:2] wh_t = paddle.transpose(wh, [0, 2, 3, 1]) wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) wh = paddle.gather(wh, inds) if self.regress_ltrb: x1 = xs - wh[:, 0:1] y1 = ys - wh[:, 1:2] x2 = xs + wh[:, 2:3] y2 = ys + wh[:, 3:4] else: x1 = xs - wh[:, 0:1] / 2 y1 = ys - wh[:, 1:2] / 2 x2 = xs + wh[:, 0:1] / 2 y2 = ys + wh[:, 1:2] / 2 n, c, feat_h, feat_w = paddle.shape(hm) padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2 padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2 x1 = x1 * self.down_ratio y1 = y1 * self.down_ratio x2 = x2 * self.down_ratio y2 = y2 * self.down_ratio x1 = x1 - padw y1 = y1 - padh x2 = x2 - padw y2 = y2 - padh bboxes = paddle.concat([x1, y1, x2, y2], axis=1) scale_y = scale_factor[:, 0:1] scale_x = scale_factor[:, 1:2] scale_expand = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=1) boxes_shape = bboxes.shape[:] scale_expand = paddle.expand(scale_expand, shape=boxes_shape) bboxes = paddle.divide(bboxes, scale_expand) results = paddle.concat([clses, scores, bboxes], axis=1) return results, paddle.shape(results)[0:1], inds, topk_clses, ys, xs @register class DETRPostProcess(object): __shared__ = ['num_classes', 'use_focal_loss', 'with_mask'] __inject__ = [] def __init__(self, num_classes=80, num_top_queries=100, dual_queries=False, dual_groups=0, use_focal_loss=False, with_mask=False, mask_stride=4, mask_threshold=0.5, use_avg_mask_score=False, bbox_decode_type='origin'): super(DETRPostProcess, self).__init__() assert bbox_decode_type in ['origin', 'pad'] self.num_classes = num_classes self.num_top_queries = num_top_queries self.dual_queries = dual_queries self.dual_groups = dual_groups self.use_focal_loss = use_focal_loss self.with_mask = with_mask self.mask_stride = mask_stride self.mask_threshold = mask_threshold self.use_avg_mask_score = use_avg_mask_score self.bbox_decode_type = bbox_decode_type def _mask_postprocess(self, mask_pred, score_pred): mask_score = F.sigmoid(mask_pred) mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) if self.use_avg_mask_score: avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( mask_pred.sum([-2, -1]) + 1e-6) 
score_pred *= avg_mask_score return mask_pred.flatten(0, 1).astype('int32'), score_pred def __call__(self, head_out, im_shape, scale_factor, pad_shape): """ Decode the bbox and mask. Args: head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. im_shape (Tensor): The shape of the input image without padding. scale_factor (Tensor): The scale factor of the input image. pad_shape (Tensor): The shape of the input image with padding. Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding to the input image, the bboxes may be used in other branch. bbox_num (Tensor): The number of prediction boxes of each batch with shape [bs], and is N. """ bboxes, logits, masks = head_out if self.dual_queries: num_queries = logits.shape[1] logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \ bboxes[:, :int(num_queries // (self.dual_groups + 1)), :] bbox_pred = bbox_cxcywh_to_xyxy(bboxes) # calculate the original shape of the image origin_shape = paddle.floor(im_shape / scale_factor + 0.5) img_h, img_w = paddle.split(origin_shape, 2, axis=-1) if self.bbox_decode_type == 'pad': # calculate the shape of the image with padding out_shape = pad_shape / im_shape * origin_shape out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) elif self.bbox_decode_type == 'origin': out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) else: raise Exception( f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.') bbox_pred *= out_shape scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax( logits)[:, :, :-1] if not self.use_focal_loss: scores, labels = scores.max(-1), scores.argmax(-1) if scores.shape[1] > self.num_top_queries: scores, index = paddle.topk( scores, self.num_top_queries, axis=-1) batch_ind = paddle.arange( end=scores.shape[0]).unsqueeze(-1).tile( [1, self.num_top_queries]) index = paddle.stack([batch_ind, index], axis=-1) labels = paddle.gather_nd(labels, index) bbox_pred = paddle.gather_nd(bbox_pred, index) else: scores, index = paddle.topk( scores.flatten(1), self.num_top_queries, axis=-1) labels = index % self.num_classes index = index // self.num_classes batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( [1, self.num_top_queries]) index = paddle.stack([batch_ind, index], axis=-1) bbox_pred = paddle.gather_nd(bbox_pred, index) mask_pred = None if self.with_mask: assert masks is not None assert masks.shape[0] == 1 masks = paddle.gather_nd(masks, index) if self.bbox_decode_type == 'pad': masks = F.interpolate( masks, scale_factor=self.mask_stride, mode="bilinear", align_corners=False) # TODO: Support prediction with bs>1. # remove padding for input image h, w = im_shape.astype('int32')[0] masks = masks[..., :h, :w] # get pred_mask in the original resolution. 
img_h = img_h[0].astype('int32') img_w = img_w[0].astype('int32') masks = F.interpolate( masks, size=[img_h, img_w], mode="bilinear", align_corners=False) mask_pred, scores = self._mask_postprocess(masks, scores) bbox_pred = paddle.concat( [ labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1), bbox_pred ], axis=-1) bbox_num = paddle.to_tensor( self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]]) bbox_pred = bbox_pred.reshape([-1, 6]) return bbox_pred, bbox_num, mask_pred @register class SparsePostProcess(object): __shared__ = ['num_classes', 'assign_on_cpu'] def __init__(self, num_proposals, num_classes=80, binary_thresh=0.5, assign_on_cpu=False): super(SparsePostProcess, self).__init__() self.num_classes = num_classes self.num_proposals = num_proposals self.binary_thresh = binary_thresh self.assign_on_cpu = assign_on_cpu def __call__(self, scores, bboxes, scale_factor, ori_shape, masks=None): assert len(scores) == len(bboxes) == \ len(ori_shape) == len(scale_factor) device = paddle.device.get_device() batch_size = len(ori_shape) scores = F.sigmoid(scores) has_mask = masks is not None if has_mask: masks = F.sigmoid(masks) masks = masks.reshape([batch_size, -1, *masks.shape[1:]]) bbox_pred = [] mask_pred = [] if has_mask else None bbox_num = paddle.zeros([batch_size], dtype='int32') for i in range(batch_size): score = scores[i] bbox = bboxes[i] score, indices = score.flatten(0, 1).topk( self.num_proposals, sorted=False) label = indices % self.num_classes if has_mask: mask = masks[i] mask = mask.flatten(0, 1)[indices] H, W = ori_shape[i][0], ori_shape[i][1] bbox = bbox[paddle.cast(indices / self.num_classes, indices.dtype)] bbox /= scale_factor[i] bbox[:, 0::2] = paddle.clip(bbox[:, 0::2], 0, W) bbox[:, 1::2] = paddle.clip(bbox[:, 1::2], 0, H) keep = ((bbox[:, 2] - bbox[:, 0]).numpy() > 1.) & \ ((bbox[:, 3] - bbox[:, 1]).numpy() > 1.) if keep.sum() == 0: bbox = paddle.zeros([1, 6], dtype='float32') if has_mask: mask = paddle.zeros([1, H, W], dtype='uint8') else: label = paddle.to_tensor(label.numpy()[keep]).astype( 'float32').unsqueeze(-1) score = paddle.to_tensor(score.numpy()[keep]).astype( 'float32').unsqueeze(-1) bbox = paddle.to_tensor(bbox.numpy()[keep]).astype('float32') if has_mask: mask = paddle.to_tensor(mask.numpy()[keep]).astype( 'float32').unsqueeze(1) mask = paste_mask(mask, bbox, H, W, self.assign_on_cpu) mask = paddle.cast(mask >= self.binary_thresh, 'uint8') bbox = paddle.concat([label, score, bbox], axis=-1) bbox_num[i] = bbox.shape[0] bbox_pred.append(bbox) if has_mask: mask_pred.append(mask) bbox_pred = paddle.concat(bbox_pred) mask_pred = paddle.concat(mask_pred) if has_mask else None if self.assign_on_cpu: paddle.set_device(device) if has_mask: return bbox_pred, bbox_num, mask_pred else: return bbox_pred, bbox_num def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False): """ Paste the mask prediction to the original image. 
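Image pixel centers are mapped into each box's normalized [-1, 1] coordinate frame and the low-resolution masks are bilinearly resampled there with F.grid_sample.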
""" x0_int, y0_int = 0, 0 x1_int, y1_int = im_w, im_h x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) N = masks.shape[0] img_y = paddle.arange(y0_int, y1_int) + 0.5 img_x = paddle.arange(x0_int, x1_int) + 0.5 img_y = (img_y - y0) / (y1 - y0) * 2 - 1 img_x = (img_x - x0) / (x1 - x0) * 2 - 1 # img_x, img_y have shapes (N, w), (N, h) if assign_on_cpu: paddle.set_device('cpu') gx = img_x[:, None, :].expand( [N, img_y.shape[1], img_x.shape[1]]) gy = img_y[:, :, None].expand( [N, img_y.shape[1], img_x.shape[1]]) grid = paddle.stack([gx, gy], axis=3) img_masks = F.grid_sample(masks, grid, align_corners=False) return img_masks[:, 0] def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): final_boxes = [] for c in range(num_classes): idxs = bboxs[:, 0] == c if np.count_nonzero(idxs) == 0: continue r = nms(bboxs[idxs, 1:], match_threshold, match_metric) final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) return final_boxes def nms(dets, match_threshold=0.6, match_metric='iou'): """ Apply NMS to avoid detecting too many overlapping bounding boxes. Args: dets: shape [N, 5], [score, x1, y1, x2, y2] match_metric: 'iou' or 'ios' match_threshold: overlap thresh for match metric. """ if dets.shape[0] == 0: return dets[[], :] scores = dets[:, 0] x1 = dets[:, 1] y1 = dets[:, 2] x2 = dets[:, 3] y2 = dets[:, 4] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(0.0, xx2 - xx1 + 1) h = np.maximum(0.0, yy2 - yy1 + 1) inter = w * h if match_metric == 'iou': union = areas[i] + areas[order[1:]] - inter match_value = inter / union elif match_metric == 'ios': smaller = np.minimum(areas[i], areas[order[1:]]) match_value = inter / smaller else: raise ValueError() inds = np.where(match_value < match_threshold)[0] order = order[inds + 1] dets = dets[keep, :] return dets @register class DETRBBoxSemiPostProcess(object): __shared__ = ['num_classes', 'use_focal_loss'] __inject__ = [] def __init__(self, num_classes=80, num_top_queries=100, use_focal_loss=False): super(DETRBBoxSemiPostProcess, self).__init__() self.num_classes = num_classes self.num_top_queries = num_top_queries self.use_focal_loss = use_focal_loss def __call__(self, head_out): """ Decode the bbox. Args: head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding to the input image, the bboxes may be used in other branch. bbox_num (Tensor): The number of prediction boxes of each batch with shape [bs], and is N. 
""" bboxes, logits, masks = head_out bbox_pred = bboxes scores = F.softmax(logits, axis=2) import copy soft_scores = copy.deepcopy(scores) scores, index = paddle.topk(scores.max(-1), 300, axis=-1) batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( [1, 300]) index = paddle.stack([batch_ind, index], axis=-1) labels = paddle.gather_nd(soft_scores.argmax(-1), index).astype('int32') score_class = paddle.gather_nd(soft_scores, index) bbox_pred = paddle.gather_nd(bbox_pred, index) bbox_pred = paddle.concat( [ labels.unsqueeze(-1).astype('float32'), score_class, scores.unsqueeze(-1), bbox_pred ], axis=-1) bbox_num = paddle.to_tensor( bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]]) bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/proposal_generator/__init__.py ================================================ from . import rpn_head from . import embedding_rpn_head from .rpn_head import * from .embedding_rpn_head import * ================================================ FILE: ppdet/modeling/proposal_generator/anchor_generator.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on # https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py import math import paddle import paddle.nn as nn import numpy as np from ppdet.core.workspace import register __all__ = ['AnchorGenerator', 'RetinaAnchorGenerator', 'S2ANetAnchorGenerator'] @register class AnchorGenerator(nn.Layer): """ Generate anchors according to the feature maps Args: anchor_sizes (list[float] | list[list[float]]): The anchor sizes at each feature point. list[float] means all feature levels share the same sizes. list[list[float]] means the anchor sizes for each level. The sizes stand for the scale of input size. aspect_ratios (list[float] | list[list[float]]): The aspect ratios at each feature point. list[float] means all feature levels share the same ratios. list[list[float]] means the aspect ratios for each level. strides (list[float]): The strides of feature maps which generate anchors offset (float): The offset of the coordinate of anchors, default 0. 
""" def __init__(self, anchor_sizes=[32, 64, 128, 256, 512], aspect_ratios=[0.5, 1.0, 2.0], strides=[16.0], variance=[1.0, 1.0, 1.0, 1.0], offset=0.): super(AnchorGenerator, self).__init__() self.anchor_sizes = anchor_sizes self.aspect_ratios = aspect_ratios self.strides = strides self.variance = variance self.cell_anchors = self._calculate_anchors(len(strides)) self.offset = offset def _broadcast_params(self, params, num_features): if not isinstance(params[0], (list, tuple)): # list[float] return [params] * num_features if len(params) == 1: return list(params) * num_features return params def generate_cell_anchors(self, sizes, aspect_ratios): anchors = [] for size in sizes: area = size**2.0 for aspect_ratio in aspect_ratios: w = math.sqrt(area / aspect_ratio) h = aspect_ratio * w x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 anchors.append([x0, y0, x1, y1]) return paddle.to_tensor(anchors, dtype='float32') def _calculate_anchors(self, num_features): sizes = self._broadcast_params(self.anchor_sizes, num_features) aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features) cell_anchors = [ self.generate_cell_anchors(s, a) for s, a in zip(sizes, aspect_ratios) ] [ self.register_buffer( t.name, t, persistable=False) for t in cell_anchors ] return cell_anchors def _create_grid_offsets(self, size, stride, offset): grid_height, grid_width = size[0], size[1] shifts_x = paddle.arange( offset * stride, grid_width * stride, step=stride, dtype='float32') shifts_y = paddle.arange( offset * stride, grid_height * stride, step=stride, dtype='float32') shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x) shift_x = paddle.reshape(shift_x, [-1]) shift_y = paddle.reshape(shift_y, [-1]) return shift_x, shift_y def _grid_anchors(self, grid_sizes): anchors = [] for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): shift_x, shift_y = self._create_grid_offsets(size, stride, self.offset) shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1) shifts = paddle.reshape(shifts, [-1, 1, 4]) base_anchors = paddle.reshape(base_anchors, [1, -1, 4]) anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4])) return anchors def forward(self, input): grid_sizes = [feature_map.shape[-2:] for feature_map in input] anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) return anchors_over_all_feature_maps @property def num_anchors(self): """ Returns: int: number of anchors at every pixel location, on that feature map. For example, if at every pixel we use anchors of 3 aspect ratios and 5 sizes, the number of anchors is 15. For FPN models, `num_anchors` on every feature map is the same. 
""" return len(self.cell_anchors[0]) @register class RetinaAnchorGenerator(AnchorGenerator): def __init__(self, octave_base_scale=4, scales_per_octave=3, aspect_ratios=[0.5, 1.0, 2.0], strides=[8.0, 16.0, 32.0, 64.0, 128.0], variance=[1.0, 1.0, 1.0, 1.0], offset=0.0): anchor_sizes = [] for s in strides: anchor_sizes.append([ s * octave_base_scale * 2**(i/scales_per_octave) \ for i in range(scales_per_octave)]) super(RetinaAnchorGenerator, self).__init__( anchor_sizes=anchor_sizes, aspect_ratios=aspect_ratios, strides=strides, variance=variance, offset=offset) @register class S2ANetAnchorGenerator(nn.Layer): """ AnchorGenerator by paddle """ def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): super(S2ANetAnchorGenerator, self).__init__() self.base_size = base_size self.scales = paddle.to_tensor(scales) self.ratios = paddle.to_tensor(ratios) self.scale_major = scale_major self.ctr = ctr self.base_anchors = self.gen_base_anchors() @property def num_base_anchors(self): return self.base_anchors.shape[0] def gen_base_anchors(self): w = self.base_size h = self.base_size if self.ctr is None: x_ctr = 0.5 * (w - 1) y_ctr = 0.5 * (h - 1) else: x_ctr, y_ctr = self.ctr h_ratios = paddle.sqrt(self.ratios) w_ratios = 1 / h_ratios if self.scale_major: ws = (w * w_ratios[:] * self.scales[:].astype(w_ratios.dtype)).reshape([-1]) hs = (h * h_ratios[:] * self.scales[:].astype(h_ratios.dtype)).reshape([-1]) else: ws = (w * self.scales[:].astype(w_ratios.dtype) * w_ratios[:]).reshape([-1]) hs = (h * self.scales[:].astype(h_ratios.dtype) * h_ratios[:]).reshape([-1]) base_anchors = paddle.stack( [ x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) ], axis=-1) base_anchors = paddle.round(base_anchors) return base_anchors def _meshgrid(self, x, y, row_major=True): yy, xx = paddle.meshgrid(y, x) yy = yy.reshape([-1]) xx = xx.reshape([-1]) if row_major: return xx, yy else: return yy, xx def forward(self, featmap_size, stride=16): # featmap_size*stride project it to original area feat_h = featmap_size[0] feat_w = featmap_size[1] shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) all_anchors = self.base_anchors[:, :] + shifts[:, :].astype(self.base_anchors.dtype) all_anchors = all_anchors.cast(paddle.float32).reshape( [feat_h * feat_w, 4]) all_anchors = self.rect2rbox(all_anchors) return all_anchors def valid_flags(self, featmap_size, valid_size): feat_h, feat_w = featmap_size valid_h, valid_w = valid_size assert valid_h <= feat_h and valid_w <= feat_w valid_x = paddle.zeros([feat_w], dtype='int32') valid_y = paddle.zeros([feat_h], dtype='int32') valid_x[:valid_w] = 1 valid_y[:valid_h] = 1 valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) valid = valid_xx & valid_yy valid = paddle.reshape(valid, [-1, 1]) valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1]) return valid def rect2rbox(self, bboxes): """ :param bboxes: shape (L, 4) (xmin, ymin, xmax, ymax) :return: dbboxes: shape (L, 5) (x_ctr, y_ctr, w, h, angle) """ x1, y1, x2, y2 = paddle.split(bboxes, 4, axis=-1) x_ctr = (x1 + x2) / 2.0 y_ctr = (y1 + y2) / 2.0 edges1 = paddle.abs(x2 - x1) edges2 = paddle.abs(y2 - y1) rbox_w = paddle.maximum(edges1, edges2) rbox_h = paddle.minimum(edges1, edges2) # set angle inds = edges1 < edges2 inds = paddle.cast(inds, paddle.float32) rboxes_angle = inds * 
np.pi / 2.0
        rboxes = paddle.concat(
            (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=-1)
        return rboxes


================================================ FILE: ppdet/modeling/proposal_generator/embedding_rpn_head.py ================================================

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This code is referenced from: https://github.com/open-mmlab/mmdetection

import paddle
from paddle import nn

from ppdet.core.workspace import register

__all__ = ['EmbeddingRPNHead']


@register
class EmbeddingRPNHead(nn.Layer):
    __shared__ = ['proposal_embedding_dim']

    def __init__(self, num_proposals, proposal_embedding_dim=256):
        super(EmbeddingRPNHead, self).__init__()
        self.num_proposals = num_proposals
        self.proposal_embedding_dim = proposal_embedding_dim
        self._init_layers()
        self._init_weights()

    def _init_layers(self):
        self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4)
        self.init_proposal_features = nn.Embedding(self.num_proposals,
                                                   self.proposal_embedding_dim)

    def _init_weights(self):
        init_bboxes = paddle.empty_like(self.init_proposal_bboxes.weight)
        init_bboxes[:, :2] = 0.5
        init_bboxes[:, 2:] = 1.0
        self.init_proposal_bboxes.weight.set_value(init_bboxes)

    @staticmethod
    def bbox_cxcywh_to_xyxy(x):
        cxcy, wh = paddle.split(x, 2, axis=-1)
        return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)

    def forward(self, img_whwh):
        proposal_bboxes = self.init_proposal_bboxes.weight.clone()
        proposal_bboxes = self.bbox_cxcywh_to_xyxy(proposal_bboxes)
        proposal_bboxes = proposal_bboxes.unsqueeze(0) * img_whwh.unsqueeze(1)

        proposal_features = self.init_proposal_features.weight.clone()
        proposal_features = proposal_features.unsqueeze(0).tile(
            [img_whwh.shape[0], 1, 1])
        return proposal_bboxes, proposal_features


================================================ FILE: ppdet/modeling/proposal_generator/proposal_generator.py ================================================

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle

from ppdet.core.workspace import register, serializable
from .. import ops


@register
@serializable
class ProposalGenerator(object):
    """
    Proposal generation module

    For more details, please refer to the document of generate_proposals
    in ppdet/modeling/ops.py

    Args:
        pre_nms_top_n (int): Number of total bboxes to be kept per
            image before NMS. default 12000
        post_nms_top_n (int): Number of total bboxes to be kept per
            image after NMS. default 2000
        nms_thresh (float): Threshold in NMS. default 0.5
        min_size (float): Remove predicted boxes with either height or
            width < min_size. default 0.1
        eta (float): Apply in adaptive NMS: if `adaptive_threshold > 0.5`,
            `adaptive_threshold = adaptive_threshold * eta` in each
            iteration. default 1.
        topk_after_collect (bool): whether to adopt topk after batch
            collection. If topk_after_collect is true, box filter will not be
            used after NMS at each image in proposal generation. default false
    """

    def __init__(self,
                 pre_nms_top_n=12000,
                 post_nms_top_n=2000,
                 nms_thresh=.5,
                 min_size=.1,
                 eta=1.,
                 topk_after_collect=False):
        super(ProposalGenerator, self).__init__()
        self.pre_nms_top_n = pre_nms_top_n
        self.post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.min_size = min_size
        self.eta = eta
        self.topk_after_collect = topk_after_collect

    def __call__(self, scores, bbox_deltas, anchors, im_shape):
        top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n
        variances = paddle.ones_like(anchors)
        if hasattr(paddle.vision.ops, "generate_proposals"):
            generate_proposals = getattr(paddle.vision.ops,
                                         "generate_proposals")
        else:
            generate_proposals = ops.generate_proposals
        rpn_rois, rpn_rois_prob, rpn_rois_num = generate_proposals(
            scores,
            bbox_deltas,
            im_shape,
            anchors,
            variances,
            pre_nms_top_n=self.pre_nms_top_n,
            post_nms_top_n=top_n,
            nms_thresh=self.nms_thresh,
            min_size=self.min_size,
            eta=self.eta,
            return_rois_num=True)

        return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n


================================================ FILE: ppdet/modeling/proposal_generator/rpn_head.py ================================================

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
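# A minimal usage sketch (not part of the original sources) for the
# ProposalGenerator defined above, assuming single-level RPN outputs with
# illustrative shapes: scores [N, A, H, W], bbox_deltas [N, 4*A, H, W],
# anchors [H*W*A, 4], im_shape [N, 2]:
#
#     gen = ProposalGenerator(pre_nms_top_n=12000, post_nms_top_n=2000)
#     rois, roi_probs, roi_num, post_nms_top_n = gen(
#         scores, bbox_deltas, anchors, im_shape)
#
# Internally it dispatches to paddle.vision.ops.generate_proposals when that
# op is available and falls back to ppdet's ops.generate_proposals otherwise.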
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal from ppdet.core.workspace import register from .anchor_generator import AnchorGenerator from .target_layer import RPNTargetAssign from .proposal_generator import ProposalGenerator from ..cls_utils import _get_class_default_kwargs class RPNFeat(nn.Layer): """ Feature extraction in RPN head Args: in_channel (int): Input channel out_channel (int): Output channel """ def __init__(self, in_channel=1024, out_channel=1024): super(RPNFeat, self).__init__() # rpn feat is shared with each level self.rpn_conv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0., std=0.01))) self.rpn_conv.skip_quant = True def forward(self, feats): rpn_feats = [] for feat in feats: rpn_feats.append(F.relu(self.rpn_conv(feat))) return rpn_feats @register class RPNHead(nn.Layer): """ Region Proposal Network Args: anchor_generator (dict): configure of anchor generation rpn_target_assign (dict): configure of rpn targets assignment train_proposal (dict): configure of proposals generation at the stage of training test_proposal (dict): configure of proposals generation at the stage of prediction in_channel (int): channel of input feature maps which can be derived by from_config """ __shared__ = ['export_onnx'] __inject__ = ['loss_rpn_bbox'] def __init__(self, anchor_generator=_get_class_default_kwargs(AnchorGenerator), rpn_target_assign=_get_class_default_kwargs(RPNTargetAssign), train_proposal=_get_class_default_kwargs(ProposalGenerator, 12000, 2000), test_proposal=_get_class_default_kwargs(ProposalGenerator), in_channel=1024, export_onnx=False, loss_rpn_bbox=None): super(RPNHead, self).__init__() self.anchor_generator = anchor_generator self.rpn_target_assign = rpn_target_assign self.train_proposal = train_proposal self.test_proposal = test_proposal self.export_onnx = export_onnx if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGenerator(**anchor_generator) if isinstance(rpn_target_assign, dict): self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) if isinstance(train_proposal, dict): self.train_proposal = ProposalGenerator(**train_proposal) if isinstance(test_proposal, dict): self.test_proposal = ProposalGenerator(**test_proposal) self.loss_rpn_bbox = loss_rpn_bbox num_anchors = self.anchor_generator.num_anchors self.rpn_feat = RPNFeat(in_channel, in_channel) # rpn head is shared with each level # rpn roi classification scores self.rpn_rois_score = nn.Conv2D( in_channels=in_channel, out_channels=num_anchors, kernel_size=1, padding=0, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0., std=0.01))) self.rpn_rois_score.skip_quant = True # rpn roi bbox regression deltas self.rpn_rois_delta = nn.Conv2D( in_channels=in_channel, out_channels=4 * num_anchors, kernel_size=1, padding=0, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0., std=0.01))) self.rpn_rois_delta.skip_quant = True @classmethod def from_config(cls, cfg, input_shape): # FPN share same rpn head if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channel': input_shape.channels} def forward(self, feats, inputs): rpn_feats = self.rpn_feat(feats) scores = [] deltas = [] for rpn_feat in rpn_feats: rrs = self.rpn_rois_score(rpn_feat) rrd = self.rpn_rois_delta(rpn_feat) scores.append(rrs) deltas.append(rrd) anchors = self.anchor_generator(rpn_feats) rois, rois_num = self._gen_proposal(scores, 
deltas, anchors, inputs) if self.training: loss = self.get_loss(scores, deltas, anchors, inputs) return rois, rois_num, loss else: return rois, rois_num, None def _gen_proposal(self, scores, bbox_deltas, anchors, inputs): """ scores (list[Tensor]): Multi-level scores prediction bbox_deltas (list[Tensor]): Multi-level deltas prediction anchors (list[Tensor]): Multi-level anchors inputs (dict): ground truth info """ prop_gen = self.train_proposal if self.training else self.test_proposal im_shape = inputs['im_shape'] # Collect multi-level proposals for each batch # Get 'topk' of them as final output if self.export_onnx: # bs = 1 when exporting onnx onnx_rpn_rois_list = [] onnx_rpn_prob_list = [] onnx_rpn_rois_num_list = [] for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors): onnx_rpn_rois, onnx_rpn_rois_prob, onnx_rpn_rois_num, onnx_post_nms_top_n = prop_gen( scores=rpn_score[0:1], bbox_deltas=rpn_delta[0:1], anchors=anchor, im_shape=im_shape[0:1]) onnx_rpn_rois_list.append(onnx_rpn_rois) onnx_rpn_prob_list.append(onnx_rpn_rois_prob) onnx_rpn_rois_num_list.append(onnx_rpn_rois_num) onnx_rpn_rois = paddle.concat(onnx_rpn_rois_list) onnx_rpn_prob = paddle.concat(onnx_rpn_prob_list).flatten() onnx_top_n = paddle.to_tensor(onnx_post_nms_top_n).cast('int32') onnx_num_rois = paddle.shape(onnx_rpn_prob)[0].cast('int32') k = paddle.minimum(onnx_top_n, onnx_num_rois) onnx_topk_prob, onnx_topk_inds = paddle.topk(onnx_rpn_prob, k) onnx_topk_rois = paddle.gather(onnx_rpn_rois, onnx_topk_inds) # TODO(wangguanzhong): Now bs_rois_collect in export_onnx is moved outside conditional branch # due to problems in dy2static of paddle. Will fix it when updating paddle framework. # bs_rois_collect = [onnx_topk_rois] # bs_rois_num_collect = paddle.shape(onnx_topk_rois)[0] else: bs_rois_collect = [] bs_rois_num_collect = [] batch_size = im_shape.shape[0] # Generate proposals for each level and each batch. # Discard batch-computing to avoid sorting bbox cross different batches. 
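            # For each image: generate proposals per feature level, then (when
            # there are multiple levels) concatenate them across levels and
            # keep only the post_nms_top_n highest-scoring proposals.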
for i in range(batch_size): rpn_rois_list = [] rpn_prob_list = [] rpn_rois_num_list = [] for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors): rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( scores=rpn_score[i:i + 1], bbox_deltas=rpn_delta[i:i + 1], anchors=anchor, im_shape=im_shape[i:i + 1]) rpn_rois_list.append(rpn_rois) rpn_prob_list.append(rpn_rois_prob) rpn_rois_num_list.append(rpn_rois_num) if len(scores) > 1: rpn_rois = paddle.concat(rpn_rois_list) rpn_prob = paddle.concat(rpn_prob_list).flatten() num_rois = rpn_prob.shape[0] num_rois = paddle.shape(rpn_prob)[0].cast('int32') if num_rois > post_nms_top_n: topk_prob, topk_inds = paddle.topk(rpn_prob, post_nms_top_n) topk_rois = paddle.gather(rpn_rois, topk_inds) else: topk_rois = rpn_rois topk_prob = rpn_prob topk_inds = paddle.zeros(shape=[post_nms_top_n], dtype="int64") else: topk_rois = rpn_rois_list[0] topk_prob = rpn_prob_list[0].flatten() bs_rois_collect.append(topk_rois) bs_rois_num_collect.append(paddle.shape(topk_rois)[0:1]) # TODO(PIR): remove this after pir bug fixed rpn_rois_list = None rpn_prob_list = None rpn_rois_num_list = None bs_rois_num_collect = paddle.concat(bs_rois_num_collect) if self.export_onnx: output_rois = [onnx_topk_rois] output_rois_num = paddle.shape(onnx_topk_rois)[0] else: output_rois = bs_rois_collect output_rois_num = bs_rois_num_collect return output_rois, output_rois_num def get_loss(self, pred_scores, pred_deltas, anchors, inputs): """ pred_scores (list[Tensor]): Multi-level scores prediction pred_deltas (list[Tensor]): Multi-level deltas prediction anchors (list[Tensor]): Multi-level anchors inputs (dict): ground truth info, including im, gt_bbox, gt_score """ anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors] anchors = paddle.concat(anchors) scores = [ paddle.reshape( paddle.transpose( v, perm=[0, 2, 3, 1]), shape=(v.shape[0], -1, 1)) for v in pred_scores ] scores = paddle.concat(scores, axis=1) deltas = [ paddle.reshape( paddle.transpose( v, perm=[0, 2, 3, 1]), shape=(v.shape[0], -1, 4)) for v in pred_deltas ] deltas = paddle.concat(deltas, axis=1) score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs, anchors) scores = paddle.reshape(x=scores, shape=(-1, )) deltas = paddle.reshape(x=deltas, shape=(-1, 4)) score_tgt = paddle.concat(score_tgt) score_tgt.stop_gradient = True pos_mask = score_tgt == 1 pos_ind = paddle.nonzero(pos_mask) valid_mask = score_tgt >= 0 valid_ind = paddle.nonzero(valid_mask) # cls loss if valid_ind.shape[0] == 0: loss_rpn_cls = paddle.zeros([1], dtype='float32') else: score_pred = paddle.gather(scores, valid_ind) score_label = paddle.gather(score_tgt, valid_ind).cast('float32') score_label.stop_gradient = True loss_rpn_cls = F.binary_cross_entropy_with_logits( logit=score_pred, label=score_label, reduction="sum") # reg loss if pos_ind.shape[0] == 0: loss_rpn_reg = paddle.zeros([1], dtype='float32') else: loc_pred = paddle.gather(deltas, pos_ind) loc_tgt = paddle.concat(loc_tgt) loc_tgt = paddle.gather(loc_tgt, pos_ind) loc_tgt.stop_gradient = True if self.loss_rpn_bbox is None: loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() else: loss_rpn_reg = self.loss_rpn_bbox(loc_pred, loc_tgt).sum() return { 'loss_rpn_cls': loss_rpn_cls / norm, 'loss_rpn_reg': loss_rpn_reg / norm } ================================================ FILE: ppdet/modeling/proposal_generator/target.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle from ..bbox_utils import bbox2delta, bbox_overlaps def rpn_anchor_target(anchors, gt_boxes, rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, use_random=True, batch_size=1, ignore_thresh=-1, is_crowd=None, weights=[1., 1., 1., 1.], assign_on_cpu=False): tgt_labels = [] tgt_bboxes = [] tgt_deltas = [] for i in range(batch_size): gt_bbox = gt_boxes[i] is_crowd_i = is_crowd[i] if is_crowd else None # Step1: match anchor and gt_bbox matches, match_labels = label_box( anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True, ignore_thresh, is_crowd_i, assign_on_cpu) # Step2: sample anchor fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im, rpn_fg_fraction, 0, use_random) # Fill with the ignore label (-1), then set positive and negative labels labels = paddle.full(match_labels.shape, -1, dtype='int32') if bg_inds.shape[0] > 0: labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds)) if fg_inds.shape[0] > 0: labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds)) # Step3: make output if gt_bbox.shape[0] == 0: matched_gt_boxes = paddle.zeros([matches.shape[0], 4]) tgt_delta = paddle.zeros([matches.shape[0], 4]) else: matched_gt_boxes = paddle.gather(gt_bbox, matches) tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights) matched_gt_boxes.stop_gradient = True tgt_delta.stop_gradient = True labels.stop_gradient = True tgt_labels.append(labels) tgt_bboxes.append(matched_gt_boxes) tgt_deltas.append(tgt_delta) return tgt_labels, tgt_bboxes, tgt_deltas def label_box(anchors, gt_boxes, positive_overlap, negative_overlap, allow_low_quality, ignore_thresh, is_crowd=None, assign_on_cpu=False): if assign_on_cpu: device = paddle.device.get_device() paddle.set_device("cpu") iou = bbox_overlaps(gt_boxes, anchors) paddle.set_device(device) else: iou = bbox_overlaps(gt_boxes, anchors) n_gt = gt_boxes.shape[0] if n_gt == 0 or is_crowd is None: n_gt_crowd = 0 else: n_gt_crowd = paddle.nonzero(is_crowd).shape[0] if iou.shape[0] == 0 or n_gt_crowd == n_gt: # No truth, assign everything to background default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64') default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32') return default_matches, default_match_labels # if ignore_thresh > 0, remove anchor if it is closed to # one of the crowded ground-truth if n_gt_crowd > 0: N_a = anchors.shape[0] ones = paddle.ones([N_a]) mask = is_crowd * ones if ignore_thresh > 0: crowd_iou = iou * mask valid = (paddle.sum((crowd_iou > ignore_thresh).cast('int32'), axis=0) > 0).cast('float32') iou = iou * (1 - valid) - valid # ignore the iou between anchor and crowded ground-truth iou = iou * (1 - mask) - mask matched_vals, matches = paddle.topk(iou, k=1, axis=0) match_labels = paddle.full(matches.shape, -1, dtype='int32') # set ignored anchor with iou = -1 neg_cond = paddle.logical_and(matched_vals > -1, matched_vals < negative_overlap) match_labels = 
paddle.where(neg_cond, paddle.zeros_like(match_labels), match_labels) match_labels = paddle.where(matched_vals >= positive_overlap, paddle.ones_like(match_labels), match_labels) if allow_low_quality: highest_quality_foreach_gt = iou.max(axis=1, keepdim=True) pred_inds_with_highest_quality = paddle.logical_and( iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum( 0, keepdim=True) match_labels = paddle.where(pred_inds_with_highest_quality > 0, paddle.ones_like(match_labels), match_labels) matches = matches.flatten() match_labels = match_labels.flatten() return matches, match_labels def subsample_labels(labels, num_samples, fg_fraction, bg_label=0, use_random=True): positive = paddle.nonzero( paddle.logical_and(labels != -1, labels != bg_label)) negative = paddle.nonzero(labels == bg_label) fg_num = int(num_samples * fg_fraction) fg_num = min(positive.numel(), fg_num) bg_num = num_samples - fg_num bg_num = min(negative.numel(), bg_num) if fg_num == 0 and bg_num == 0: fg_inds = paddle.zeros([0], dtype='int32') bg_inds = paddle.zeros([0], dtype='int32') return fg_inds, bg_inds # randomly select positive and negative examples negative = negative.cast('int32').flatten() bg_perm = paddle.randperm(negative.numel(), dtype='int32') bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num]) if use_random: bg_inds = paddle.gather(negative, bg_perm) else: bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num]) if fg_num == 0: fg_inds = paddle.zeros([0], dtype='int32') return fg_inds, bg_inds positive = positive.cast('int32').flatten() fg_perm = paddle.randperm(positive.numel(), dtype='int32') fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num]) if use_random: fg_inds = paddle.gather(positive, fg_perm) else: fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num]) return fg_inds, bg_inds def generate_proposal_target(rpn_rois, gt_classes, gt_boxes, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh, num_classes, ignore_thresh=-1., is_crowd=None, use_random=True, is_cascade=False, cascade_iou=0.5, assign_on_cpu=False, add_gt_as_proposals=True): rois_with_gt = [] tgt_labels = [] tgt_bboxes = [] tgt_gt_inds = [] new_rois_num = [] # In cascade rcnn, the threshold for foreground and background # is used from cascade_iou fg_thresh = cascade_iou if is_cascade else fg_thresh bg_thresh = cascade_iou if is_cascade else bg_thresh for i, rpn_roi in enumerate(rpn_rois): gt_bbox = gt_boxes[i] is_crowd_i = is_crowd[i] if is_crowd else None gt_class = paddle.squeeze(gt_classes[i], axis=-1) # Concat RoIs and gt boxes except cascade rcnn or none gt if add_gt_as_proposals and gt_bbox.shape[0] > 0: bbox = paddle.concat([rpn_roi, gt_bbox]) else: bbox = rpn_roi # Step1: label bbox matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh, False, ignore_thresh, is_crowd_i, assign_on_cpu) # Step2: sample bbox sampled_inds, sampled_gt_classes = sample_bbox( matches, match_labels, gt_class, batch_size_per_im, fg_fraction, num_classes, use_random, is_cascade) # Step3: make output rois_per_image = bbox if is_cascade else paddle.gather(bbox, sampled_inds) sampled_gt_ind = matches if is_cascade else paddle.gather(matches, sampled_inds) if gt_bbox.shape[0] > 0: sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) else: num = rois_per_image.shape[0] sampled_bbox = paddle.zeros([num, 4], dtype='float32') rois_per_image.stop_gradient = True sampled_gt_ind.stop_gradient = True sampled_bbox.stop_gradient = True tgt_labels.append(sampled_gt_classes) 
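        # Collect the remaining per-image targets below; new_rois_num records
        # how many RoIs were sampled for each image, so the flattened outputs
        # can later be split back per image.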
tgt_bboxes.append(sampled_bbox) rois_with_gt.append(rois_per_image) tgt_gt_inds.append(sampled_gt_ind) new_rois_num.append(paddle.shape(sampled_inds)[0:1]) new_rois_num = paddle.concat(new_rois_num) return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num def sample_bbox(matches, match_labels, gt_classes, batch_size_per_im, fg_fraction, num_classes, use_random=True, is_cascade=False): n_gt = gt_classes.shape[0] if n_gt == 0: # No truth, assign everything to background gt_classes = paddle.ones(matches.shape, dtype='int32') * num_classes #return matches, match_labels + num_classes else: gt_classes = paddle.gather(gt_classes, matches) gt_classes = paddle.where(match_labels == 0, paddle.ones_like(gt_classes) * num_classes, gt_classes) gt_classes = paddle.where(match_labels == -1, paddle.ones_like(gt_classes) * -1, gt_classes) if is_cascade: index = paddle.arange(matches.shape[0]) return index, gt_classes rois_per_image = int(batch_size_per_im) fg_inds, bg_inds = subsample_labels(gt_classes, rois_per_image, fg_fraction, num_classes, use_random) if fg_inds.shape[0] == 0 and bg_inds.shape[0] == 0: # fake output labeled with -1 when all boxes are neither # foreground nor background sampled_inds = paddle.zeros([1], dtype='int32') else: sampled_inds = paddle.concat([fg_inds, bg_inds]) sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) return sampled_inds, sampled_gt_classes def polygons_to_mask(polygons, height, width): """ Convert the polygons to mask format Args: polygons (list[ndarray]): each array has shape (Nx2,) height (int): mask height width (int): mask width Returns: ndarray: a bool mask of shape (height, width) """ import pycocotools.mask as mask_util assert len(polygons) > 0, "COCOAPI does not support empty polygons" rles = mask_util.frPyObjects(polygons, height, width) rle = mask_util.merge(rles) return mask_util.decode(rle).astype(np.bool_) def rasterize_polygons_within_box(poly, box, resolution): w, h = box[2] - box[0], box[3] - box[1] polygons = [np.asarray(p, dtype=np.float64) for p in poly] for p in polygons: p[0::2] = p[0::2] - box[0] p[1::2] = p[1::2] - box[1] ratio_h = resolution / max(h, 0.1) ratio_w = resolution / max(w, 0.1) if ratio_h == ratio_w: for p in polygons: p *= ratio_h else: for p in polygons: p[0::2] *= ratio_w p[1::2] *= ratio_h # 3. 
Rasterize the polygons with coco api mask = polygons_to_mask(polygons, resolution, resolution) mask = paddle.to_tensor(mask, dtype='int32') return mask def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, num_classes, resolution): mask_rois = [] mask_rois_num = [] tgt_masks = [] tgt_classes = [] mask_index = [] tgt_weights = [] for k in range(len(rois)): labels_per_im = labels_int32[k] # select rois labeled with foreground fg_inds = paddle.nonzero( paddle.logical_and(labels_per_im != -1, labels_per_im != num_classes)) has_fg = True # generate fake roi if foreground is empty if fg_inds.numel() == 0: has_fg = False fg_inds = paddle.ones([1, 1], dtype='int64') inds_per_im = sampled_gt_inds[k] inds_per_im = paddle.gather(inds_per_im, fg_inds) rois_per_im = rois[k] fg_rois = paddle.gather(rois_per_im, fg_inds) # Copy the foreground roi to cpu # to generate mask target with ground-truth boxes = fg_rois.numpy() gt_segms_per_im = gt_segms[k] new_segm = [] inds_per_im = inds_per_im.numpy() if len(gt_segms_per_im) > 0: for i in inds_per_im: new_segm.append(gt_segms_per_im[i]) fg_inds_new = fg_inds.reshape([-1]).numpy() results = [] if len(gt_segms_per_im) > 0: for j in range(fg_inds_new.shape[0]): results.append( rasterize_polygons_within_box(new_segm[j], boxes[j], resolution)) else: results.append(paddle.ones([resolution, resolution], dtype='int32')) fg_classes = paddle.gather(labels_per_im, fg_inds) weight = paddle.ones([fg_rois.shape[0]], dtype='float32') if not has_fg: # now all sampled classes are background # which will cause error in loss calculation, # make fake classes with weight of 0. fg_classes = paddle.zeros([1], dtype='int32') weight = weight - 1 tgt_mask = paddle.stack(results) tgt_mask.stop_gradient = True fg_rois.stop_gradient = True mask_index.append(fg_inds) mask_rois.append(fg_rois) mask_rois_num.append(paddle.shape(fg_rois)[0:1]) tgt_classes.append(fg_classes) tgt_masks.append(tgt_mask) tgt_weights.append(weight) mask_index = paddle.concat(mask_index) mask_rois_num = paddle.concat(mask_rois_num) tgt_classes = paddle.concat(tgt_classes, axis=0) tgt_masks = paddle.concat(tgt_masks, axis=0) tgt_weights = paddle.concat(tgt_weights, axis=0) return mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights def libra_sample_pos(max_overlaps, max_classes, pos_inds, num_expected): if len(pos_inds) <= num_expected: return pos_inds else: unique_gt_inds = np.unique(max_classes[pos_inds]) num_gts = len(unique_gt_inds) num_per_gt = int(round(num_expected / float(num_gts)) + 1) sampled_inds = [] for i in unique_gt_inds: inds = np.nonzero(max_classes == i)[0] before_len = len(inds) inds = list(set(inds) & set(pos_inds)) after_len = len(inds) if len(inds) > num_per_gt: inds = np.random.choice(inds, size=num_per_gt, replace=False) sampled_inds.extend(list(inds)) # combine as a new sampler if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(set(pos_inds) - set(sampled_inds))) assert len(sampled_inds) + len(extra_inds) == len(pos_inds), \ "sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!".format( len(sampled_inds), len(extra_inds), len(pos_inds)) if len(extra_inds) > num_extra: extra_inds = np.random.choice( extra_inds, size=num_extra, replace=False) sampled_inds.extend(extra_inds.tolist()) elif len(sampled_inds) > num_expected: sampled_inds = np.random.choice( sampled_inds, size=num_expected, replace=False) return paddle.to_tensor(sampled_inds) def 
libra_sample_via_interval(max_overlaps, full_set, num_expected, floor_thr, num_bins, bg_thresh): max_iou = max_overlaps.max() iou_interval = (max_iou - floor_thr) / num_bins per_num_expected = int(num_expected / num_bins) sampled_inds = [] for i in range(num_bins): start_iou = floor_thr + i * iou_interval end_iou = floor_thr + (i + 1) * iou_interval tmp_set = set( np.where( np.logical_and(max_overlaps >= start_iou, max_overlaps < end_iou))[0]) tmp_inds = list(tmp_set & full_set) if len(tmp_inds) > per_num_expected: tmp_sampled_set = np.random.choice( tmp_inds, size=per_num_expected, replace=False) else: tmp_sampled_set = np.array(tmp_inds, dtype=np.int32) sampled_inds.append(tmp_sampled_set) sampled_inds = np.concatenate(sampled_inds) if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(full_set - set(sampled_inds))) assert len(sampled_inds) + len(extra_inds) == len(full_set), \ "sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!".format( len(sampled_inds), len(extra_inds), len(full_set)) if len(extra_inds) > num_extra: extra_inds = np.random.choice(extra_inds, num_extra, replace=False) sampled_inds = np.concatenate([sampled_inds, extra_inds]) return sampled_inds def libra_sample_neg(max_overlaps, max_classes, neg_inds, num_expected, floor_thr=-1, floor_fraction=0, num_bins=3, bg_thresh=0.5): if len(neg_inds) <= num_expected: return neg_inds else: # balance sampling for negative samples neg_set = set(neg_inds.tolist()) if floor_thr > 0: floor_set = set( np.where( np.logical_and(max_overlaps >= 0, max_overlaps < floor_thr)) [0]) iou_sampling_set = set(np.where(max_overlaps >= floor_thr)[0]) elif floor_thr == 0: floor_set = set(np.where(max_overlaps == 0)[0]) iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) else: floor_set = set() iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) floor_thr = 0 floor_neg_inds = list(floor_set & neg_set) iou_sampling_neg_inds = list(iou_sampling_set & neg_set) num_expected_iou_sampling = int(num_expected * (1 - floor_fraction)) if len(iou_sampling_neg_inds) > num_expected_iou_sampling: if num_bins >= 2: iou_sampled_inds = libra_sample_via_interval( max_overlaps, set(iou_sampling_neg_inds), num_expected_iou_sampling, floor_thr, num_bins, bg_thresh) else: iou_sampled_inds = np.random.choice( iou_sampling_neg_inds, size=num_expected_iou_sampling, replace=False) else: iou_sampled_inds = np.array(iou_sampling_neg_inds, dtype=np.int32) num_expected_floor = num_expected - len(iou_sampled_inds) if len(floor_neg_inds) > num_expected_floor: sampled_floor_inds = np.random.choice( floor_neg_inds, size=num_expected_floor, replace=False) else: sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int32) sampled_inds = np.concatenate((sampled_floor_inds, iou_sampled_inds)) if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(neg_set - set(sampled_inds))) if len(extra_inds) > num_extra: extra_inds = np.random.choice( extra_inds, size=num_extra, replace=False) sampled_inds = np.concatenate((sampled_inds, extra_inds)) return paddle.to_tensor(sampled_inds) def libra_label_box(anchors, gt_boxes, gt_classes, positive_overlap, negative_overlap, num_classes): # TODO: use paddle API to speed up gt_classes = gt_classes.numpy() gt_overlaps = np.zeros((anchors.shape[0], num_classes)) matches = np.zeros((anchors.shape[0]), dtype=np.int32) if len(gt_boxes) > 0: proposal_to_gt_overlaps = bbox_overlaps(anchors, 
gt_boxes).numpy() overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) overlaps_max = proposal_to_gt_overlaps.max(axis=1) # Boxes which with non-zero overlap with gt boxes overlapped_boxes_ind = np.where(overlaps_max > 0)[0] overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ overlapped_boxes_ind]] for idx in range(len(overlapped_boxes_ind)): gt_overlaps[overlapped_boxes_ind[idx], overlapped_boxes_gt_classes[ idx]] = overlaps_max[overlapped_boxes_ind[idx]] matches[overlapped_boxes_ind[idx]] = overlaps_argmax[ overlapped_boxes_ind[idx]] gt_overlaps = paddle.to_tensor(gt_overlaps) matches = paddle.to_tensor(matches) matched_vals = paddle.max(gt_overlaps, axis=1) match_labels = paddle.full(matches.shape, -1, dtype='int32') match_labels = paddle.where(matched_vals < negative_overlap, paddle.zeros_like(match_labels), match_labels) match_labels = paddle.where(matched_vals >= positive_overlap, paddle.ones_like(match_labels), match_labels) return matches, match_labels, matched_vals def libra_sample_bbox(matches, match_labels, matched_vals, gt_classes, batch_size_per_im, num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, use_random=True, is_cascade_rcnn=False): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) bg_rois_per_im = rois_per_image - fg_rois_per_im if is_cascade_rcnn: fg_inds = paddle.nonzero(matched_vals >= fg_thresh) bg_inds = paddle.nonzero(matched_vals < bg_thresh) else: matched_vals_np = matched_vals.numpy() match_labels_np = match_labels.numpy() # sample fg fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten() fg_nums = int(np.minimum(fg_rois_per_im, fg_inds.shape[0])) if (fg_inds.shape[0] > fg_nums) and use_random: fg_inds = libra_sample_pos(matched_vals_np, match_labels_np, fg_inds.numpy(), fg_rois_per_im) fg_inds = fg_inds[:fg_nums] # sample bg bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten() bg_nums = int(np.minimum(rois_per_image - fg_nums, bg_inds.shape[0])) if (bg_inds.shape[0] > bg_nums) and use_random: bg_inds = libra_sample_neg( matched_vals_np, match_labels_np, bg_inds.numpy(), bg_rois_per_im, num_bins=num_bins, bg_thresh=bg_thresh) bg_inds = bg_inds[:bg_nums] sampled_inds = paddle.concat([fg_inds, bg_inds]) gt_classes = paddle.gather(gt_classes, matches) gt_classes = paddle.where(match_labels == 0, paddle.ones_like(gt_classes) * num_classes, gt_classes) gt_classes = paddle.where(match_labels == -1, paddle.ones_like(gt_classes) * -1, gt_classes) sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) return sampled_inds, sampled_gt_classes def libra_generate_proposal_target(rpn_rois, gt_classes, gt_boxes, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh, num_classes, use_random=True, is_cascade_rcnn=False, max_overlaps=None, num_bins=3): rois_with_gt = [] tgt_labels = [] tgt_bboxes = [] sampled_max_overlaps = [] tgt_gt_inds = [] new_rois_num = [] for i, rpn_roi in enumerate(rpn_rois): max_overlap = max_overlaps[i] if is_cascade_rcnn else None gt_bbox = gt_boxes[i] gt_class = paddle.squeeze(gt_classes[i], axis=-1) if is_cascade_rcnn: rpn_roi = filter_roi(rpn_roi, max_overlap) bbox = paddle.concat([rpn_roi, gt_bbox]) # Step1: label bbox matches, match_labels, matched_vals = libra_label_box( bbox, gt_bbox, gt_class, fg_thresh, bg_thresh, num_classes) # Step2: sample bbox sampled_inds, sampled_gt_classes = libra_sample_bbox( matches, match_labels, matched_vals, gt_class, batch_size_per_im, num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, use_random, is_cascade_rcnn) # 
Step3: make output
        rois_per_image = paddle.gather(bbox, sampled_inds)
        sampled_gt_ind = paddle.gather(matches, sampled_inds)
        sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind)
        sampled_overlap = paddle.gather(matched_vals, sampled_inds)

        rois_per_image.stop_gradient = True
        sampled_gt_ind.stop_gradient = True
        sampled_bbox.stop_gradient = True
        sampled_overlap.stop_gradient = True

        tgt_labels.append(sampled_gt_classes)
        tgt_bboxes.append(sampled_bbox)
        rois_with_gt.append(rois_per_image)
        sampled_max_overlaps.append(sampled_overlap)
        tgt_gt_inds.append(sampled_gt_ind)
        new_rois_num.append(paddle.shape(sampled_inds)[0:1])
    new_rois_num = paddle.concat(new_rois_num)
    # rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num
    return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num


================================================ FILE: ppdet/modeling/proposal_generator/target_layer.py ================================================

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import paddle
from ppdet.core.workspace import register, serializable
from .target import rpn_anchor_target, generate_proposal_target, generate_mask_target, libra_generate_proposal_target
import numpy as np


@register
@serializable
class RPNTargetAssign(object):
    __shared__ = ['assign_on_cpu']
    """
    RPN targets assignment module

    The assignment consists of three steps:
        1. Match anchor and ground-truth box, label the anchor with
           foreground or background sample
        2. Sample anchors to keep a proper ratio between foreground
           and background
        3. Generate the targets for classification and regression branch

    Args:
        batch_size_per_im (int): Total number of RPN samples per image.
            default 256
        fg_fraction (float): Fraction of anchors that is labeled foreground,
            default 0.5
        positive_overlap (float): Minimum overlap required between an anchor
            and ground-truth box for the (anchor, gt box) pair to be
            a foreground sample. default 0.7
        negative_overlap (float): Maximum overlap allowed between an anchor
            and ground-truth box for the (anchor, gt box) pair to be
            a background sample. default 0.3
        ignore_thresh (float): Threshold for ignoring the is_crowd
            ground-truth if the value is larger than zero.
        use_random (bool): Use random sampling to choose foreground and
            background boxes, default true.
        assign_on_cpu (bool): In case the number of gt box is too large,
            compute IoU on CPU, default false.
    """

    def __init__(self,
                 batch_size_per_im=256,
                 fg_fraction=0.5,
                 positive_overlap=0.7,
                 negative_overlap=0.3,
                 ignore_thresh=-1.,
                 use_random=True,
                 assign_on_cpu=False):
        super(RPNTargetAssign, self).__init__()
        self.batch_size_per_im = batch_size_per_im
        self.fg_fraction = fg_fraction
        self.positive_overlap = positive_overlap
        self.negative_overlap = negative_overlap
        self.ignore_thresh = ignore_thresh
        self.use_random = use_random
        self.assign_on_cpu = assign_on_cpu

    def __call__(self, inputs, anchors):
        """
        inputs: ground-truth instances.
        anchor_box (Tensor): [num_anchors, 4], num_anchors are all anchors
            in all feature maps.
        """
        gt_boxes = inputs['gt_bbox']
        is_crowd = inputs.get('is_crowd', None)
        batch_size = len(gt_boxes)
        tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target(
            anchors,
            gt_boxes,
            self.batch_size_per_im,
            self.positive_overlap,
            self.negative_overlap,
            self.fg_fraction,
            self.use_random,
            batch_size,
            self.ignore_thresh,
            is_crowd,
            assign_on_cpu=self.assign_on_cpu)
        norm = self.batch_size_per_im * batch_size
        return tgt_labels, tgt_bboxes, tgt_deltas, norm


@register
class BBoxAssigner(object):
    __shared__ = ['num_classes', 'assign_on_cpu']
    """
    RCNN targets assignment module

    The assignment consists of three steps:
        1. Match RoIs and ground-truth box, label the RoIs with
           foreground or background sample
        2. Sample RoIs to keep a proper ratio between foreground
           and background
        3. Generate the targets for classification and regression branch

    Args:
        batch_size_per_im (int): Total number of RoIs per image.
            default 512
        fg_fraction (float): Fraction of RoIs that is labeled foreground,
            default 0.25
        fg_thresh (float): Minimum overlap required between a RoI and
            ground-truth box for the (roi, gt box) pair to be a foreground
            sample. default 0.5
        bg_thresh (float): Maximum overlap allowed between a RoI and
            ground-truth box for the (roi, gt box) pair to be a background
            sample. default 0.5
        ignore_thresh (float): Threshold for ignoring the is_crowd
            ground-truth if the value is larger than zero.
        use_random (bool): Use random sampling to choose foreground and
            background boxes, default true
        cascade_iou (list[float]): The list of overlap to select foreground
            and background of each stage, which is only used in Cascade RCNN.
        num_classes (int): The number of classes.
        assign_on_cpu (bool): In case the number of gt box is too large,
            compute IoU on CPU, default false.
    """

    def __init__(self,
                 batch_size_per_im=512,
                 fg_fraction=.25,
                 fg_thresh=.5,
                 bg_thresh=.5,
                 ignore_thresh=-1.,
                 use_random=True,
                 cascade_iou=[0.5, 0.6, 0.7],
                 num_classes=80,
                 assign_on_cpu=False):
        super(BBoxAssigner, self).__init__()
        self.batch_size_per_im = batch_size_per_im
        self.fg_fraction = fg_fraction
        self.fg_thresh = fg_thresh
        self.bg_thresh = bg_thresh
        self.ignore_thresh = ignore_thresh
        self.use_random = use_random
        self.cascade_iou = cascade_iou
        self.num_classes = num_classes
        self.assign_on_cpu = assign_on_cpu

    def __call__(self,
                 rpn_rois,
                 rpn_rois_num,
                 inputs,
                 stage=0,
                 is_cascade=False,
                 add_gt_as_proposals=True):
        gt_classes = inputs['gt_class']
        gt_boxes = inputs['gt_bbox']
        is_crowd = inputs.get('is_crowd', None)
        # rois, tgt_labels, tgt_bboxes, tgt_gt_inds
        # new_rois_num
        outs = generate_proposal_target(
            rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,
            self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes,
            self.ignore_thresh, is_crowd, self.use_random, is_cascade,
            self.cascade_iou[stage], self.assign_on_cpu, add_gt_as_proposals)
        rois = outs[0]
        rois_num = outs[-1]
        # tgt_labels, tgt_bboxes, tgt_gt_inds
        targets = outs[1:4]
        return rois, rois_num, targets


@register
class BBoxLibraAssigner(object):
    __shared__ = ['num_classes']
    """
    Libra-RCNN targets assignment module

    The assignment consists of three steps:
        1. Match RoIs and ground-truth box, label the RoIs with
           foreground or background sample
        2. Sample RoIs to keep a proper ratio between foreground
           and background
        3. Generate the targets for classification and regression branch

    Args:
        batch_size_per_im (int): Total number of RoIs per image.
            default 512
        fg_fraction (float): Fraction of RoIs that is labeled foreground,
            default 0.25
        fg_thresh (float): Minimum overlap required between a RoI and
            ground-truth box for the (roi, gt box) pair to be a foreground
            sample. default 0.5
        bg_thresh (float): Maximum overlap allowed between a RoI and
            ground-truth box for the (roi, gt box) pair to be a background
            sample. default 0.5
        use_random (bool): Use random sampling to choose foreground and
            background boxes, default true
        cascade_iou (list[float]): The list of overlap to select foreground
            and background of each stage, which is only used in Cascade RCNN.
        num_classes (int): The number of classes.
        num_bins (int): The number of IoU bins used by Libra negative
            sampling.
    """

    def __init__(self,
                 batch_size_per_im=512,
                 fg_fraction=.25,
                 fg_thresh=.5,
                 bg_thresh=.5,
                 use_random=True,
                 cascade_iou=[0.5, 0.6, 0.7],
                 num_classes=80,
                 num_bins=3):
        super(BBoxLibraAssigner, self).__init__()
        self.batch_size_per_im = batch_size_per_im
        self.fg_fraction = fg_fraction
        self.fg_thresh = fg_thresh
        self.bg_thresh = bg_thresh
        self.use_random = use_random
        self.cascade_iou = cascade_iou
        self.num_classes = num_classes
        self.num_bins = num_bins

    def __call__(self,
                 rpn_rois,
                 rpn_rois_num,
                 inputs,
                 stage=0,
                 is_cascade=False):
        gt_classes = inputs['gt_class']
        gt_boxes = inputs['gt_bbox']
        # rois, tgt_labels, tgt_bboxes, tgt_gt_inds
        outs = libra_generate_proposal_target(
            rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,
            self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes,
            self.use_random, is_cascade, self.cascade_iou[stage], self.num_bins)
        rois = outs[0]
        rois_num = outs[-1]
        # tgt_labels, tgt_bboxes, tgt_gt_inds
        targets = outs[1:4]
        return rois, rois_num, targets


@register
@serializable
class MaskAssigner(object):
    __shared__ = ['num_classes', 'mask_resolution']
    """
    Mask targets assignment module

    The assignment consists of three steps:
        1. Select the RoIs labeled as foreground.
        2.
Encode the RoIs and corresponding gt polygons to generate mask target Args: num_classes (int): The number of class mask_resolution (int): The resolution of mask target, default 14 """ def __init__(self, num_classes=80, mask_resolution=14): super(MaskAssigner, self).__init__() self.num_classes = num_classes self.mask_resolution = mask_resolution def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs): gt_segms = inputs['gt_poly'] outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds, self.num_classes, self.mask_resolution) # mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights return outs @register class RBoxAssigner(object): """ assigner of rbox Args: pos_iou_thr (float): threshold of pos samples neg_iou_thr (float): threshold of neg samples min_iou_thr (float): the min threshold of samples ignore_iof_thr (int): the ignored threshold """ def __init__(self, pos_iou_thr=0.5, neg_iou_thr=0.4, min_iou_thr=0.0, ignore_iof_thr=-2): super(RBoxAssigner, self).__init__() self.pos_iou_thr = pos_iou_thr self.neg_iou_thr = neg_iou_thr self.min_iou_thr = min_iou_thr self.ignore_iof_thr = ignore_iof_thr def anchor_valid(self, anchors): """ Args: anchor: M x 4 Returns: """ if anchors.ndim == 3: anchors = anchors.reshape(-1, anchors.shape[-1]) assert anchors.ndim == 2 anchor_num = anchors.shape[0] anchor_valid = np.ones((anchor_num), np.int32) anchor_inds = np.arange(anchor_num) return anchor_inds def rbox2delta(self, proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): """ Args: proposals: tensor [N, 5] gt: gt [N, 5] means: means [5] stds: stds [5] Returns: """ proposals = proposals.astype(np.float64) PI = np.pi gt_widths = gt[..., 2] gt_heights = gt[..., 3] gt_angle = gt[..., 4] proposals_widths = proposals[..., 2] proposals_heights = proposals[..., 3] proposals_angle = proposals[..., 4] coord = gt[..., 0:2] - proposals[..., 0:2] dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) * coord[..., 1]) / proposals_widths dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) * coord[..., 1]) / proposals_heights dw = np.log(gt_widths / proposals_widths) dh = np.log(gt_heights / proposals_heights) da = (gt_angle - proposals_angle) da = (da + PI / 4) % PI - PI / 4 da /= PI deltas = np.stack([dx, dy, dw, dh, da], axis=-1) means = np.array(means, dtype=deltas.dtype) stds = np.array(stds, dtype=deltas.dtype) deltas = (deltas - means) / stds deltas = deltas.astype(np.float32) return deltas def assign_anchor(self, anchors, gt_bboxes, gt_labels, pos_iou_thr, neg_iou_thr, min_iou_thr=0.0, ignore_iof_thr=-2): assert anchors.shape[1] == 4 or anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5 anchors_xc_yc = anchors gt_bboxes_xc_yc = gt_bboxes # calc rbox iou anchors_xc_yc = anchors_xc_yc.astype(np.float32) gt_bboxes_xc_yc = gt_bboxes_xc_yc.astype(np.float32) anchors_xc_yc = paddle.to_tensor(anchors_xc_yc) gt_bboxes_xc_yc = paddle.to_tensor(gt_bboxes_xc_yc) try: from ext_op import rbox_iou except Exception as e: print("import custom_ops error, try install ext_op " \ "following ppdet/ext_op/README.md", e) sys.stdout.flush() sys.exit(-1) iou = rbox_iou(gt_bboxes_xc_yc, anchors_xc_yc) iou = iou.numpy() iou = iou.T # every gt's anchor's index gt_bbox_anchor_inds = iou.argmax(axis=0) gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])] gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0] # every anchor's gt bbox's index anchor_gt_bbox_inds = iou.argmax(axis=1) anchor_gt_bbox_iou 
= iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds] # (1) set labels=-2 as default labels = np.ones((iou.shape[0], ), dtype=np.int32) * ignore_iof_thr # (2) assign ignore labels[anchor_gt_bbox_iou < min_iou_thr] = ignore_iof_thr # (3) assign neg_ids -1 assign_neg_ids1 = anchor_gt_bbox_iou >= min_iou_thr assign_neg_ids2 = anchor_gt_bbox_iou < neg_iou_thr assign_neg_ids = np.logical_and(assign_neg_ids1, assign_neg_ids2) labels[assign_neg_ids] = -1 # anchor_gt_bbox_iou_inds # (4) assign max_iou as pos_ids >=0 anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds] # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, anchor_gt_bbox_iou >= min_iou_thr) labels[gt_bbox_anchor_iou_inds] = gt_labels[anchor_gt_bbox_iou_inds] # (5) assign >= pos_iou_thr as pos_ids iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids] labels[iou_pos_iou_thr_ids] = gt_labels[iou_pos_iou_thr_ids_box_inds] return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd): assert anchors.ndim == 2 assert anchors.shape[1] == 5 assert gt_bboxes.ndim == 2 assert gt_bboxes.shape[1] == 5 pos_iou_thr = self.pos_iou_thr neg_iou_thr = self.neg_iou_thr min_iou_thr = self.min_iou_thr ignore_iof_thr = self.ignore_iof_thr anchor_num = anchors.shape[0] gt_bboxes = gt_bboxes is_crowd_slice = is_crowd not_crowd_inds = np.where(is_crowd_slice == 0) # Step1: match anchor and gt_bbox anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = self.assign_anchor( anchors, gt_bboxes, gt_labels.reshape(-1), pos_iou_thr, neg_iou_thr, min_iou_thr, ignore_iof_thr) # Step2: sample anchor pos_inds = np.where(labels >= 0)[0] neg_inds = np.where(labels == -1)[0] # Step3: make output anchors_num = anchors.shape[0] bbox_targets = np.zeros_like(anchors) bbox_weights = np.zeros_like(anchors) bbox_gt_bboxes = np.zeros_like(anchors) pos_labels = np.zeros(anchors_num, dtype=np.int32) pos_labels_weights = np.zeros(anchors_num, dtype=np.float32) pos_sampled_anchors = anchors[pos_inds] pos_sampled_gt_boxes = gt_bboxes[anchor_gt_bbox_inds[pos_inds]] if len(pos_inds) > 0: pos_bbox_targets = self.rbox2delta(pos_sampled_anchors, pos_sampled_gt_boxes) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_gt_bboxes[pos_inds, :] = pos_sampled_gt_boxes bbox_weights[pos_inds, :] = 1.0 pos_labels[pos_inds] = labels[pos_inds] pos_labels_weights[pos_inds] = 1.0 if len(neg_inds) > 0: pos_labels_weights[neg_inds] = 1.0 return (pos_labels, pos_labels_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) ================================================ FILE: ppdet/modeling/rbox_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
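# A minimal round-trip sketch (not from the original file) for the rbox
# helpers defined below, with illustrative values:
#
#     poly = [0., 0., 4., 0., 4., 2., 0., 2.]            # 4x2 axis-aligned box
#     rboxes = poly2rbox_np([poly], rbox_type='le135')   # [[cx, cy, w, h, angle]]
#     polys = rbox2poly_np(rboxes)                       # back to 8-point polygons
#
# For the box above, 'le135' yields approximately [2., 1., 4., 2., 0.].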
import math import paddle import numpy as np import cv2 def norm_angle(angle, range=[-np.pi / 4, np.pi]): return (angle - range[0]) % range[1] + range[0] # rbox function implemented using numpy def poly2rbox_le135_np(poly): """convert poly to rbox [-pi / 4, 3 * pi / 4] Args: poly: [x1, y1, x2, y2, x3, y3, x4, y4] Returns: rbox: [cx, cy, w, h, angle] """ poly = np.array(poly[:8], dtype=np.float32) pt1 = (poly[0], poly[1]) pt2 = (poly[2], poly[3]) pt3 = (poly[4], poly[5]) pt4 = (poly[6], poly[7]) edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) * (pt1[1] - pt2[1])) edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) * (pt2[1] - pt3[1])) width = max(edge1, edge2) height = min(edge1, edge2) rbox_angle = 0 if edge1 > edge2: rbox_angle = np.arctan2(float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0])) elif edge2 >= edge1: rbox_angle = np.arctan2(float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0])) rbox_angle = norm_angle(rbox_angle) x_ctr = float(pt1[0] + pt3[0]) / 2 y_ctr = float(pt1[1] + pt3[1]) / 2 return [x_ctr, y_ctr, width, height, rbox_angle] def poly2rbox_oc_np(poly): """convert poly to rbox (0, pi / 2] Args: poly: [x1, y1, x2, y2, x3, y3, x4, y4] Returns: rbox: [cx, cy, w, h, angle] """ points = np.array(poly, dtype=np.float32).reshape((-1, 2)) (cx, cy), (w, h), angle = cv2.minAreaRect(points) # using the new OpenCV Rotated BBox definition since 4.5.1 # if angle < 0, opencv is older than 4.5.1, angle is in [-90, 0) if angle < 0: angle += 90 w, h = h, w # convert angle to [0, 90) if angle == -0.0: angle = 0.0 if angle == 90.0: angle = 0.0 w, h = h, w angle = angle / 180 * np.pi return [cx, cy, w, h, angle] def poly2rbox_np(polys, rbox_type='oc'): """ polys: [x0,y0,x1,y1,x2,y2,x3,y3] to rboxes: [x_ctr,y_ctr,w,h,angle] """ assert rbox_type in ['oc', 'le135'], 'only oc or le135 is supported now' poly2rbox_fn = poly2rbox_oc_np if rbox_type == 'oc' else poly2rbox_le135_np rboxes = [] for poly in polys: x, y, w, h, angle = poly2rbox_fn(poly) rbox = np.array([x, y, w, h, angle], dtype=np.float32) rboxes.append(rbox) return np.array(rboxes) def cal_line_length(point1, point2): return math.sqrt( math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) def get_best_begin_point_single(coordinate): x1, y1, x2, y2, x3, y3, x4, y4 = coordinate xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) xmax = max(x1, x2, x3, x4) ymax = max(y1, y2, y3, y4) combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] force = 100000000.0 force_flag = 0 for i in range(4): temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + cal_line_length(combinate[i][3], dst_coordinate[3]) if temp_force < force: force = temp_force force_flag = i if force_flag != 0: pass return np.array(combinate[force_flag]).reshape(8) def rbox2poly_np(rboxes): """ rboxes:[x_ctr,y_ctr,w,h,angle] to poly:[x0,y0,x1,y1,x2,y2,x3,y3] """ polys = [] for i in range(len(rboxes)): x_ctr, y_ctr, width, height, angle = rboxes[i][:5] tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) R = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]) poly = R.dot(rect) x0, x1, x2, x3 = 
poly[0, :4] + x_ctr y0, y1, y2, y3 = poly[1, :4] + y_ctr poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) poly = get_best_begin_point_single(poly) polys.append(poly) polys = np.array(polys) return polys # rbox function implemented using paddle def box2corners(box): """convert box coordinate to corners Args: box (Tensor): (B, N, 5) with (x, y, w, h, alpha) angle is in [0, 90) Returns: corners (Tensor): (B, N, 4, 2) with (x1, y1, x2, y2, x3, y3, x4, y4) """ B = box.shape[0] x, y, w, h, alpha = paddle.split(box, 5, axis=-1) x4 = paddle.to_tensor( [0.5, 0.5, -0.5, -0.5], dtype=paddle.float32).reshape( (1, 1, 4)) # (1,1,4) x4 = x4 * w # (B, N, 4) y4 = paddle.to_tensor( [-0.5, 0.5, 0.5, -0.5], dtype=paddle.float32).reshape((1, 1, 4)) y4 = y4 * h # (B, N, 4) corners = paddle.stack([x4, y4], axis=-1) # (B, N, 4, 2) sin = paddle.sin(alpha) cos = paddle.cos(alpha) row1 = paddle.concat([cos, sin], axis=-1) row2 = paddle.concat([-sin, cos], axis=-1) # (B, N, 2) rot_T = paddle.stack([row1, row2], axis=-2) # (B, N, 2, 2) rotated = paddle.bmm(corners.reshape([-1, 4, 2]), rot_T.reshape([-1, 2, 2])) rotated = rotated.reshape([B, -1, 4, 2]) # (B*N, 4, 2) -> (B, N, 4, 2) rotated[..., 0] += x rotated[..., 1] += y return rotated def paddle_gather(x, dim, index): index_shape = index.shape index_flatten = index.flatten() if dim < 0: dim = len(x.shape) + dim nd_index = [] for k in range(len(x.shape)): if k == dim: nd_index.append(index_flatten) else: reshape_shape = [1] * len(x.shape) reshape_shape[k] = x.shape[k] x_arange = paddle.arange(x.shape[k], dtype=index.dtype) x_arange = x_arange.reshape(reshape_shape) dim_index = paddle.expand(x_arange, index_shape).flatten() nd_index.append(dim_index) ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) return paddle_out def check_points_in_polys(points, polys): """Check whether point is in rotated boxes Args: points (tensor): (1, L, 2) anchor points polys (tensor): [B, N, 4, 2] gt_polys eps (float): default 1e-9 Returns: is_in_polys (tensor): (B, N, L) """ # [1, L, 2] -> [1, 1, L, 2] points = points.unsqueeze(0) # [B, N, 4, 2] -> [B, N, 1, 2] a, b, c, d = polys.split(4, axis=2) ab = b - a ad = d - a # [B, N, L, 2] ap = points - a # [B, N, 1] norm_ab = paddle.sum(ab * ab, axis=-1) # [B, N, 1] norm_ad = paddle.sum(ad * ad, axis=-1) # [B, N, L] dot product ap_dot_ab = paddle.sum(ap * ab, axis=-1) # [B, N, L] dot product ap_dot_ad = paddle.sum(ap * ad, axis=-1) # [B, N, L] = |A|*|B|*cos(theta) is_in_polys = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & ( ap_dot_ad >= 0) & (ap_dot_ad <= norm_ad) return is_in_polys def check_points_in_rotated_boxes(points, boxes): """Check whether point is in rotated boxes Args: points (tensor): (1, L, 2) anchor points boxes (tensor): [B, N, 5] gt_bboxes eps (float): default 1e-9 Returns: is_in_box (tensor): (B, N, L) """ # [B, N, 5] -> [B, N, 4, 2] corners = box2corners(boxes) # [1, L, 2] -> [1, 1, L, 2] points = points.unsqueeze(0) # [B, N, 4, 2] -> [B, N, 1, 2] a, b, c, d = corners.split(4, axis=2) ab = b - a ad = d - a # [B, N, L, 2] ap = points - a # [B, N, L] norm_ab = paddle.sum(ab * ab, axis=-1) # [B, N, L] norm_ad = paddle.sum(ad * ad, axis=-1) # [B, N, L] dot product ap_dot_ab = paddle.sum(ap * ab, axis=-1) # [B, N, L] dot product ap_dot_ad = paddle.sum(ap * ad, axis=-1) # [B, N, L] = |A|*|B|*cos(theta) is_in_box = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (ap_dot_ad >= 0) & ( ap_dot_ad <= norm_ad) return is_in_box def 
rotated_iou_similarity(box1, box2, eps=1e-9, func=''):
    """Calculate IoU between box1 and box2.

    Args:
        box1 (Tensor): box with the shape [N, M1, 5]
        box2 (Tensor): box with the shape [N, M2, 5]

    Returns:
        iou (Tensor): IoU between box1 and box2 with the shape [N, M1, M2]
    """
    from ext_op import rbox_iou

    rotated_ious = []
    for b1, b2 in zip(box1, box2):
        rotated_ious.append(rbox_iou(b1, b2))

    return paddle.stack(rotated_ious, axis=0)


================================================
FILE: ppdet/modeling/reid/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import jde_embedding_head
from . import fairmot_embedding_head
from . import resnet
from . import pyramidal_embedding
from . import pplcnet_embedding
from . import resnet_embedding

from .fairmot_embedding_head import *
from .jde_embedding_head import *
from .resnet import *
from .pyramidal_embedding import *
from .pplcnet_embedding import *
from .resnet_embedding import *

================================================
FILE: ppdet/modeling/reid/fairmot_embedding_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import KaimingUniform, Uniform
from ppdet.core.workspace import register
from ppdet.modeling.heads.centernet_head import ConvLayer

__all__ = ['FairMOTEmbeddingHead']


@register
class FairMOTEmbeddingHead(nn.Layer):
    __shared__ = ['num_classes']
    """
    Args:
        in_channels (int): the channel number of input to FairMOTEmbeddingHead.
        ch_head (int): the channel of features before being fed into the embedding, 256 by default.
        ch_emb (int): the channel of the embedding feature, 128 by default.
        num_identities_dict (dict): the number of identities of each category,
            supports single-class and multi-class, {0: 14455} by default.
""" def __init__(self, in_channels, ch_head=256, ch_emb=128, num_classes=1, num_identities_dict={0: 14455}): super(FairMOTEmbeddingHead, self).__init__() assert num_classes >= 1 self.num_classes = num_classes self.ch_emb = ch_emb self.num_identities_dict = num_identities_dict self.reid = nn.Sequential( ConvLayer( in_channels, ch_head, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( ch_head, ch_emb, kernel_size=1, stride=1, padding=0, bias=True)) param_attr = paddle.ParamAttr(initializer=KaimingUniform()) bound = 1 / math.sqrt(ch_emb) bias_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) self.reid_loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum') if num_classes == 1: nID = self.num_identities_dict[0] # single class self.classifier = nn.Linear( ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) # When num_identities(nID) is 1, emb_scale is set as 1 self.emb_scale = math.sqrt(2) * math.log(nID - 1) if nID > 1 else 1 else: self.classifiers = dict() self.emb_scale_dict = dict() for cls_id, nID in self.num_identities_dict.items(): self.classifiers[str(cls_id)] = nn.Linear( ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) # When num_identities(nID) is 1, emb_scale is set as 1 self.emb_scale_dict[str(cls_id)] = math.sqrt(2) * math.log( nID - 1) if nID > 1 else 1 @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels} def process_by_class(self, bboxes, embedding, bbox_inds, topk_clses): pred_dets, pred_embs = [], [] for cls_id in range(self.num_classes): inds_masks = topk_clses == cls_id inds_masks = paddle.cast(inds_masks, 'float32') pos_num = inds_masks.sum().numpy() if pos_num == 0: continue cls_inds_mask = inds_masks > 0 bbox_mask = paddle.nonzero(cls_inds_mask) cls_bboxes = paddle.gather_nd(bboxes, bbox_mask) pred_dets.append(cls_bboxes) cls_inds = paddle.masked_select(bbox_inds, cls_inds_mask) cls_inds = cls_inds.unsqueeze(-1) cls_embedding = paddle.gather_nd(embedding, cls_inds) pred_embs.append(cls_embedding) return paddle.concat(pred_dets), paddle.concat(pred_embs) def forward(self, neck_feat, inputs, bboxes=None, bbox_inds=None, topk_clses=None): reid_feat = self.reid(neck_feat) if self.training: if self.num_classes == 1: loss = self.get_loss(reid_feat, inputs) else: loss = self.get_mc_loss(reid_feat, inputs) return loss else: assert bboxes is not None and bbox_inds is not None reid_feat = F.normalize(reid_feat) embedding = paddle.transpose(reid_feat, [0, 2, 3, 1]) embedding = paddle.reshape(embedding, [-1, self.ch_emb]) # embedding shape: [bs * h * w, ch_emb] if self.num_classes == 1: pred_dets = bboxes pred_embs = paddle.gather(embedding, bbox_inds) else: pred_dets, pred_embs = self.process_by_class( bboxes, embedding, bbox_inds, topk_clses) return pred_dets, pred_embs def get_loss(self, feat, inputs): index = inputs['index'] mask = inputs['index_mask'] target = inputs['reid'] target = paddle.masked_select(target, mask > 0) target = paddle.unsqueeze(target, 1) feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) feat_n, feat_h, feat_w, feat_c = feat.shape feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) index = paddle.unsqueeze(index, 2) batch_inds = list() for i in range(feat_n): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) feat = paddle.gather_nd(feat, 
index=index) mask = paddle.unsqueeze(mask, axis=2) mask = paddle.expand_as(mask, feat) mask.stop_gradient = True feat = paddle.masked_select(feat, mask > 0) feat = paddle.reshape(feat, shape=[-1, feat_c]) feat = F.normalize(feat) feat = self.emb_scale * feat logit = self.classifier(feat) target.stop_gradient = True loss = self.reid_loss(logit, target) valid = (target != self.reid_loss.ignore_index) valid.stop_gradient = True count = paddle.sum((paddle.cast(valid, dtype=np.int32))) count.stop_gradient = True if count > 0: loss = loss / count return loss def get_mc_loss(self, feat, inputs): # feat.shape = [bs, ch_emb, h, w] assert 'cls_id_map' in inputs and 'cls_tr_ids' in inputs index = inputs['index'] mask = inputs['index_mask'] cls_id_map = inputs['cls_id_map'] # [bs, h, w] cls_tr_ids = inputs['cls_tr_ids'] # [bs, num_classes, h, w] feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) feat_n, feat_h, feat_w, feat_c = feat.shape feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) index = paddle.unsqueeze(index, 2) batch_inds = list() for i in range(feat_n): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) feat = paddle.gather_nd(feat, index=index) mask = paddle.unsqueeze(mask, axis=2) mask = paddle.expand_as(mask, feat) mask.stop_gradient = True feat = paddle.masked_select(feat, mask > 0) feat = paddle.reshape(feat, shape=[-1, feat_c]) reid_losses = 0 for cls_id, id_num in self.num_identities_dict.items(): # target cur_cls_tr_ids = paddle.reshape( cls_tr_ids[:, cls_id, :, :], shape=[feat_n, -1]) # [bs, h*w] cls_id_target = paddle.gather_nd(cur_cls_tr_ids, index=index) mask = inputs['index_mask'] cls_id_target = paddle.masked_select(cls_id_target, mask > 0) cls_id_target.stop_gradient = True # feat cls_id_feat = self.emb_scale_dict[str(cls_id)] * F.normalize(feat) cls_id_pred = self.classifiers[str(cls_id)](cls_id_feat) loss = self.reid_loss(cls_id_pred, cls_id_target) valid = (cls_id_target != self.reid_loss.ignore_index) valid.stop_gradient = True count = paddle.sum((paddle.cast(valid, dtype=np.int32))) count.stop_gradient = True if count > 0: loss = loss / count reid_losses += loss return reid_losses ================================================ FILE: ppdet/modeling/reid/jde_embedding_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
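# A small sketch (toy shapes, illustrative only) of the index trick used by
# FairMOTEmbeddingHead.get_loss/get_mc_loss above: paddle.gather_nd wants
# (batch_id, position) pairs, so a column of batch ids is prepended to the
# per-image flat indices before the per-pixel embeddings are gathered:
#
#     bs, hw, c, k = 2, 6, 4, 3
#     feat = paddle.randn([bs, hw, c])                  # flattened H*W features
#     index = paddle.to_tensor([[0, 2, 5], [1, 3, 4]])  # [bs, k] flat positions
#     index = index.unsqueeze(2)                        # [bs, k, 1]
#     batch_inds = paddle.arange(bs).reshape([bs, 1, 1]).tile([1, k, 1])
#     nd_index = paddle.concat([batch_inds, index], axis=2)  # [bs, k, 2]
#     picked = paddle.gather_nd(feat, nd_index)              # [bs, k, c]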
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from paddle.nn.initializer import Normal, Constant

__all__ = ['JDEEmbeddingHead']


class LossParam(nn.Layer):
    def __init__(self, init_value=0., use_uncertainty=True):
        # `use_uncertainty` is reserved for API compatibility; not used below.
        super(LossParam, self).__init__()
        self.loss_param = self.create_parameter(
            shape=[1],
            attr=ParamAttr(initializer=Constant(value=init_value)),
            dtype="float32")

    def forward(self, inputs):
        out = paddle.exp(-self.loss_param) * inputs + self.loss_param
        return out * 0.5


@register
class JDEEmbeddingHead(nn.Layer):
    __shared__ = ['num_classes']
    __inject__ = ['emb_loss', 'jde_loss']
    """
    JDEEmbeddingHead
    Args:
        num_classes(int): Number of classes. Only supports single-class tracking.
        num_identities(int): Number of identities.
        anchor_levels(int): Number of anchor levels, same as FPN levels.
        anchor_scales(int): Number of anchor scales on each FPN level.
        embedding_dim(int): Embedding dimension. Default: 512.
        emb_loss(object): Instance of 'JDEEmbeddingLoss'
        jde_loss(object): Instance of 'JDELoss'
    """

    def __init__(
            self,
            num_classes=1,
            num_identities=14455,  # dataset.num_identities_dict[0]
            anchor_levels=3,
            anchor_scales=4,
            embedding_dim=512,
            emb_loss='JDEEmbeddingLoss',
            jde_loss='JDELoss'):
        super(JDEEmbeddingHead, self).__init__()
        self.num_classes = num_classes
        self.num_identities = num_identities
        self.anchor_levels = anchor_levels
        self.anchor_scales = anchor_scales
        self.embedding_dim = embedding_dim
        self.emb_loss = emb_loss
        self.jde_loss = jde_loss

        self.emb_scale = math.sqrt(2) * math.log(
            self.num_identities - 1) if self.num_identities > 1 else 1

        self.identify_outputs = []
        self.loss_params_cls = []
        self.loss_params_reg = []
        self.loss_params_ide = []
        for i in range(self.anchor_levels):
            name = 'identify_output.{}'.format(i)
            identify_output = self.add_sublayer(
                name,
                nn.Conv2D(
                    in_channels=64 * (2**self.anchor_levels) // (2**i),
                    out_channels=self.embedding_dim,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias_attr=ParamAttr(regularizer=L2Decay(0.))))
            self.identify_outputs.append(identify_output)

            loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15))
            self.loss_params_cls.append(loss_p_cls)
            loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85))
            self.loss_params_reg.append(loss_p_reg)
            loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3))
            self.loss_params_ide.append(loss_p_ide)

        self.classifier = self.add_sublayer(
            'classifier',
            nn.Linear(
                self.embedding_dim,
                self.num_identities,
                weight_attr=ParamAttr(
                    learning_rate=1., initializer=Normal(
                        mean=0.0, std=0.01)),
                bias_attr=ParamAttr(
                    learning_rate=2., regularizer=L2Decay(0.))))

    def forward(self,
                identify_feats,
                targets,
                loss_confs=None,
                loss_boxes=None,
                bboxes=None,
                boxes_idx=None,
                nms_keep_idx=None):
        assert self.num_classes == 1, 'JDE only supports single-class MOT.'
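        # Training: per-level embedding maps feed emb_loss, then jde_loss
        # combines the cls/reg/ide terms with the learnable LossParam
        # uncertainty weights. Eval: embeddings are gathered for the boxes
        # kept by NMS and (pred_dets, pred_embs) are returned for the tracker.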
assert len(identify_feats) == self.anchor_levels ide_outs = [] for feat, ide_head in zip(identify_feats, self.identify_outputs): ide_outs.append(ide_head(feat)) if self.training: assert len(loss_confs) == len(loss_boxes) == self.anchor_levels loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale, self.classifier) jde_losses = self.jde_loss( loss_confs, loss_boxes, loss_ides, self.loss_params_cls, self.loss_params_reg, self.loss_params_ide, targets) return jde_losses else: assert bboxes is not None assert boxes_idx is not None assert nms_keep_idx is not None emb_outs = self.get_emb_outs(ide_outs) emb_valid = paddle.gather_nd(emb_outs, boxes_idx) pred_embs = paddle.gather_nd(emb_valid, nms_keep_idx) input_shape = targets['image'].shape[2:] # input_shape: [h, w], before data transforms, set in model config im_shape = targets['im_shape'][0].numpy() # im_shape: [new_h, new_w], after data transforms scale_factor = targets['scale_factor'][0].numpy() bboxes[:, 2:] = self.scale_coords(bboxes[:, 2:], input_shape, im_shape, scale_factor) # cls_ids, scores, tlwhs pred_dets = bboxes return pred_dets, pred_embs def scale_coords(self, coords, input_shape, im_shape, scale_factor): ratio = scale_factor[0] pad_w = (input_shape[1] - int(im_shape[1])) / 2 pad_h = (input_shape[0] - int(im_shape[0])) / 2 coords = paddle.cast(coords, 'float32') coords[:, 0::2] -= pad_w coords[:, 1::2] -= pad_h coords[:, 0:4] /= ratio coords[:, :4] = paddle.clip( coords[:, :4], min=0, max=coords[:, :4].max()) return coords.round() def get_emb_and_gt_outs(self, ide_outs, targets): emb_and_gts = [] for i, p_ide in enumerate(ide_outs): t_conf = targets['tconf{}'.format(i)] t_ide = targets['tide{}'.format(i)] p_ide = p_ide.transpose((0, 2, 3, 1)) p_ide_flatten = paddle.reshape(p_ide, [-1, self.embedding_dim]) mask = t_conf > 0 mask = paddle.cast(mask, dtype="int64") emb_mask = mask.max(1).flatten() emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() if len(emb_mask_inds) > 0: t_ide_flatten = paddle.reshape(t_ide.max(1), [-1, 1]) tids = paddle.gather(t_ide_flatten, emb_mask_inds) embedding = paddle.gather(p_ide_flatten, emb_mask_inds) embedding = self.emb_scale * F.normalize(embedding) emb_and_gt = paddle.concat([embedding, tids], axis=1) emb_and_gts.append(emb_and_gt) if len(emb_and_gts) > 0: return paddle.concat(emb_and_gts, axis=0) else: return paddle.zeros((1, self.embedding_dim + 1)) def get_emb_outs(self, ide_outs): emb_outs = [] for i, p_ide in enumerate(ide_outs): p_ide = p_ide.transpose((0, 2, 3, 1)) p_ide_repeat = paddle.tile(p_ide, [self.anchor_scales, 1, 1, 1]) embedding = F.normalize(p_ide_repeat, axis=-1) emb = paddle.reshape(embedding, [-1, self.embedding_dim]) emb_outs.append(emb) if len(emb_outs) > 0: return paddle.concat(emb_outs, axis=0) else: return paddle.zeros((1, self.embedding_dim)) ================================================ FILE: ppdet/modeling/reid/pplcnet_embedding.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from paddle import ParamAttr from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Linear from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingNormal, XavierNormal from ppdet.core.workspace import register __all__ = ['PPLCNetEmbedding'] # Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. # k: kernel_size # in_c: input channel number in depthwise block # out_c: output channel number in depthwise block # s: stride in depthwise block # use_se: whether to use SE block NET_CONFIG = { "blocks2": #k, in_c, out_c, s, use_se [[3, 16, 32, 1, False]], "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] } def make_divisible(v, divisor=8, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNLayer(nn.Layer): def __init__(self, num_channels, filter_size, num_filters, stride, num_groups=1): super().__init__() self.conv = Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=num_groups, weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False) self.bn = BatchNorm2D( num_filters, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.hardswish = nn.Hardswish() def forward(self, x): x = self.conv(x) x = self.bn(x) x = self.hardswish(x) return x class DepthwiseSeparable(nn.Layer): def __init__(self, num_channels, num_filters, stride, dw_size=3, use_se=False): super().__init__() self.use_se = use_se self.dw_conv = ConvBNLayer( num_channels=num_channels, num_filters=num_channels, filter_size=dw_size, stride=stride, num_groups=num_channels) if use_se: self.se = SEModule(num_channels) self.pw_conv = ConvBNLayer( num_channels=num_channels, filter_size=1, num_filters=num_filters, stride=1) def forward(self, x): x = self.dw_conv(x) if self.use_se: x = self.se(x) x = self.pw_conv(x) return x class SEModule(nn.Layer): def __init__(self, channel, reduction=4): super().__init__() self.avg_pool = AdaptiveAvgPool2D(1) self.conv1 = Conv2D( in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0) self.relu = nn.ReLU() self.conv2 = Conv2D( in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0) self.hardsigmoid = nn.Hardsigmoid() def forward(self, x): identity = x x = self.avg_pool(x) x = self.conv1(x) x = self.relu(x) x = self.conv2(x) x = self.hardsigmoid(x) x = paddle.multiply(x=identity, y=x) return x class PPLCNet(nn.Layer): """ PP-LCNet, see https://arxiv.org/abs/2109.15099. This code is different from PPLCNet in ppdet/modeling/backbones/lcnet.py or in PaddleClas, because the output is the flatten feature of last_conv. Args: scale (float): Scale ratio of channels. 
class_expand (int): Number of channels of conv feature. """ def __init__(self, scale=1.0, class_expand=1280): super(PPLCNet, self).__init__() self.scale = scale self.class_expand = class_expand self.conv1 = ConvBNLayer( num_channels=3, filter_size=3, num_filters=make_divisible(16 * scale), stride=2) self.blocks2 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) ]) self.blocks3 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) ]) self.blocks4 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) ]) self.blocks5 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) ]) self.blocks6 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) ]) self.avg_pool = AdaptiveAvgPool2D(1) self.last_conv = Conv2D( in_channels=make_divisible(NET_CONFIG["blocks6"][-1][2] * scale), out_channels=self.class_expand, kernel_size=1, stride=1, padding=0, bias_attr=False) self.hardswish = nn.Hardswish() self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) def forward(self, x): x = self.conv1(x) x = self.blocks2(x) x = self.blocks3(x) x = self.blocks4(x) x = self.blocks5(x) x = self.blocks6(x) x = self.avg_pool(x) x = self.last_conv(x) x = self.hardswish(x) x = self.flatten(x) return x class FC(nn.Layer): def __init__(self, input_ch, output_ch): super(FC, self).__init__() weight_attr = ParamAttr(initializer=XavierNormal()) self.fc = paddle.nn.Linear(input_ch, output_ch, weight_attr=weight_attr) def forward(self, x): out = self.fc(x) return out @register class PPLCNetEmbedding(nn.Layer): """ PPLCNet Embedding Args: input_ch (int): Number of channels of input conv feature. output_ch (int): Number of channels of output conv feature. """ def __init__(self, scale=2.5, input_ch=1280, output_ch=512): super(PPLCNetEmbedding, self).__init__() self.backbone = PPLCNet(scale=scale) self.neck = FC(input_ch, output_ch) def forward(self, x): feat = self.backbone(x) feat_out = self.neck(feat) return feat_out ================================================ FILE: ppdet/modeling/reid/pyramidal_embedding.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
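# A shape sketch for PPLCNetEmbedding above (the crop size is an assumption;
# ReID models typically take small person crops):
#
#     model = PPLCNetEmbedding(scale=2.5, input_ch=1280, output_ch=512)
#     x = paddle.randn([4, 3, 128, 64])  # NCHW person crops
#     emb = model(x)                     # PPLCNet -> flatten -> FC neck
#     assert emb.shape == [4, 512]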
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from paddle import ParamAttr from .resnet import ResNet50, ResNet101 from ppdet.core.workspace import register __all__ = ['PCBPyramid'] @register class PCBPyramid(nn.Layer): """ PCB (Part-based Convolutional Baseline), see https://arxiv.org/abs/1711.09349, Pyramidal Person Re-IDentification, see https://arxiv.org/abs/1810.12193 Args: input_ch (int): Number of channels of the input feature. num_stripes (int): Number of sub-parts. used_levels (tuple): Whether the level is used, 1 means used. num_classes (int): Number of classes for identities, default 751 in Market-1501 dataset. last_conv_stride (int): Stride of the last conv. last_conv_dilation (int): Dilation of the last conv. num_conv_out_channels (int): Number of channels of conv feature. """ def __init__(self, input_ch=2048, model_name='ResNet101', num_stripes=6, used_levels=(1, 1, 1, 1, 1, 1), num_classes=751, last_conv_stride=1, last_conv_dilation=1, num_conv_out_channels=128): super(PCBPyramid, self).__init__() self.num_stripes = num_stripes self.used_levels = used_levels self.num_classes = num_classes self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)] self.num_branches = sum(self.num_in_each_level) assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) self.base = eval(model_name)( lr_mult=0.1, last_conv_stride=last_conv_stride, last_conv_dilation=last_conv_dilation) self.dropout_layer = nn.Dropout(p=0.2) self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch( num_conv_out_channels, input_ch) def basic_branch(self, num_conv_out_channels, input_ch): # the level indexes are defined from fine to coarse, # the branch will contain one more part than that of its previous level # the sliding step is set to 1 pyramid_conv_list = nn.LayerList() pyramid_fc_list = nn.LayerList() idx_levels = 0 for idx_branches in range(self.num_branches): if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): idx_levels += 1 pyramid_conv_list.append( nn.Sequential( nn.Conv2D(input_ch, num_conv_out_channels, 1), nn.BatchNorm2D(num_conv_out_channels), nn.ReLU())) idx_levels = 0 for idx_branches in range(self.num_branches): if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): idx_levels += 1 fc = nn.Linear( in_features=num_conv_out_channels, out_features=self.num_classes, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.001)), bias_attr=ParamAttr(initializer=Constant(value=0.))) pyramid_fc_list.append(fc) return pyramid_conv_list, pyramid_fc_list def pyramid_forward(self, feat): each_stripe_size = int(feat.shape[2] / self.num_stripes) feat_list, logits_list = [], [] idx_levels = 0 used_branches = 0 for idx_branches in range(self.num_branches): if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): idx_levels += 1 idx_in_each_level = idx_branches - sum(self.num_in_each_level[ 0:idx_levels]) stripe_size_in_each_level = each_stripe_size * (idx_levels + 1) start = idx_in_each_level * each_stripe_size end = start + stripe_size_in_each_level k = feat.shape[-1] local_feat_avgpool = F.avg_pool2d( feat[:, :, start:end, :], kernel_size=(stripe_size_in_each_level, k)) local_feat_maxpool = F.max_pool2d( feat[:, :, start:end, :], kernel_size=(stripe_size_in_each_level, k)) local_feat = local_feat_avgpool + local_feat_maxpool local_feat = 
self.pyramid_conv_list0[used_branches](local_feat) local_feat = paddle.reshape( local_feat, shape=[local_feat.shape[0], -1]) feat_list.append(local_feat) local_logits = self.pyramid_fc_list0[used_branches]( self.dropout_layer(local_feat)) logits_list.append(local_logits) used_branches += 1 return feat_list, logits_list def forward(self, x): feat = self.base(x) assert feat.shape[2] % self.num_stripes == 0 feat_list, logits_list = self.pyramid_forward(feat) feat_out = paddle.concat(feat_list, axis=-1) return feat_out ================================================ FILE: ppdet/modeling/reid/resnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import math import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal __all__ = ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] class ConvBNLayer(nn.Layer): def __init__(self, num_channels, num_filters, filter_size, stride=1, dilation=1, groups=1, act=None, lr_mult=1.0, name=None, data_format="NCHW"): super(ConvBNLayer, self).__init__() conv_stdv = filter_size * filter_size * num_filters self._conv = nn.Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, dilation=dilation, groups=groups, weight_attr=ParamAttr( learning_rate=lr_mult, initializer=Normal(0, math.sqrt(2. 
/ conv_stdv))), bias_attr=False, data_format=data_format) self._batch_norm = nn.BatchNorm2D(num_filters) self.act = act def forward(self, inputs): y = self._conv(inputs) y = self._batch_norm(y) if self.act: y = getattr(F, self.act)(y) return y class BottleneckBlock(nn.Layer): def __init__(self, num_channels, num_filters, stride, shortcut=True, name=None, lr_mult=1.0, dilation=1, data_format="NCHW"): super(BottleneckBlock, self).__init__() self.conv0 = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=1, dilation=dilation, act="relu", lr_mult=lr_mult, name=name + "_branch2a", data_format=data_format) self.conv1 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters, filter_size=3, dilation=dilation, stride=stride, act="relu", lr_mult=lr_mult, name=name + "_branch2b", data_format=data_format) self.conv2 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters * 4, filter_size=1, dilation=dilation, act=None, lr_mult=lr_mult, name=name + "_branch2c", data_format=data_format) if not shortcut: self.short = ConvBNLayer( num_channels=num_channels, num_filters=num_filters * 4, filter_size=1, dilation=dilation, stride=stride, lr_mult=lr_mult, name=name + "_branch1", data_format=data_format) self.shortcut = shortcut self._num_channels_out = num_filters * 4 def forward(self, inputs): y = self.conv0(inputs) conv1 = self.conv1(y) conv2 = self.conv2(conv1) if self.shortcut: short = inputs else: short = self.short(inputs) y = paddle.add(x=short, y=conv2) y = F.relu(y) return y class BasicBlock(nn.Layer): def __init__(self, num_channels, num_filters, stride, shortcut=True, name=None, data_format="NCHW"): super(BasicBlock, self).__init__() self.stride = stride self.conv0 = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=3, stride=stride, act="relu", name=name + "_branch2a", data_format=data_format) self.conv1 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters, filter_size=3, act=None, name=name + "_branch2b", data_format=data_format) if not shortcut: self.short = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=1, stride=stride, name=name + "_branch1", data_format=data_format) self.shortcut = shortcut def forward(self, inputs): y = self.conv0(inputs) conv1 = self.conv1(y) if self.shortcut: short = inputs else: short = self.short(inputs) y = paddle.add(x=short, y=conv1) y = F.relu(y) return y class ResNet(nn.Layer): def __init__(self, layers=50, lr_mult=1.0, last_conv_stride=2, last_conv_dilation=1): super(ResNet, self).__init__() self.layers = layers self.data_format = "NCHW" self.input_image_channel = 3 supported_layers = [18, 34, 50, 101, 152] assert layers in supported_layers, \ "supported layers are {} but input layer is {}".format( supported_layers, layers) if layers == 18: depth = [2, 2, 2, 2] elif layers == 34 or layers == 50: depth = [3, 4, 6, 3] elif layers == 101: depth = [3, 4, 23, 3] elif layers == 152: depth = [3, 8, 36, 3] num_channels = [64, 256, 512, 1024] if layers >= 50 else [64, 64, 128, 256] num_filters = [64, 128, 256, 512] self.conv = ConvBNLayer( num_channels=self.input_image_channel, num_filters=64, filter_size=7, stride=2, act="relu", lr_mult=lr_mult, name="conv1", data_format=self.data_format) self.pool2d_max = nn.MaxPool2D( kernel_size=3, stride=2, padding=1, data_format=self.data_format) self.block_list = [] if layers >= 50: for block in range(len(depth)): shortcut = False for i in range(depth[block]): if layers in [101, 152] and block == 2: if i == 0: conv_name = 
"res" + str(block + 2) + "a" else: conv_name = "res" + str(block + 2) + "b" + str(i) else: conv_name = "res" + str(block + 2) + chr(97 + i) if i != 0 or block == 0: stride = 1 elif block == len(depth) - 1: stride = last_conv_stride else: stride = 2 bottleneck_block = self.add_sublayer( conv_name, BottleneckBlock( num_channels=num_channels[block] if i == 0 else num_filters[block] * 4, num_filters=num_filters[block], stride=stride, shortcut=shortcut, name=conv_name, lr_mult=lr_mult, dilation=last_conv_dilation if block == len(depth) - 1 else 1, data_format=self.data_format)) self.block_list.append(bottleneck_block) shortcut = True else: for block in range(len(depth)): shortcut = False for i in range(depth[block]): conv_name = "res" + str(block + 2) + chr(97 + i) basic_block = self.add_sublayer( conv_name, BasicBlock( num_channels=num_channels[block] if i == 0 else num_filters[block], num_filters=num_filters[block], stride=2 if i == 0 and block != 0 else 1, shortcut=shortcut, name=conv_name, data_format=self.data_format)) self.block_list.append(basic_block) shortcut = True def forward(self, inputs): y = self.conv(inputs) y = self.pool2d_max(y) for block in self.block_list: y = block(y) return y def ResNet18(**args): model = ResNet(layers=18, **args) return model def ResNet34(**args): model = ResNet(layers=34, **args) return model def ResNet50(pretrained=None, **args): model = ResNet(layers=50, **args) if pretrained is not None: if not (os.path.isdir(pretrained) or os.path.exists(pretrained + '.pdparams')): raise ValueError("Model pretrain path {} does not " "exists.".format(pretrained)) param_state_dict = paddle.load(pretrained + '.pdparams') model.set_dict(param_state_dict) return model def ResNet101(pretrained=None, **args): model = ResNet(layers=101, **args) if pretrained is not None: if not (os.path.isdir(pretrained) or os.path.exists(pretrained + '.pdparams')): raise ValueError("Model pretrain path {} does not " "exists.".format(pretrained)) param_state_dict = paddle.load(pretrained + '.pdparams') model.set_dict(param_state_dict) return model def ResNet152(**args): model = ResNet(layers=152, **args) return model ================================================ FILE: ppdet/modeling/reid/resnet_embedding.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import paddle import paddle.nn.functional as F from paddle import nn from .resnet import ResNet50, ResNet101 from ppdet.core.workspace import register __all__ = ['ResNetEmbedding'] @register class ResNetEmbedding(nn.Layer): in_planes = 2048 def __init__(self, model_name='ResNet50', last_stride=1): super(ResNetEmbedding, self).__init__() assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) self.base = eval(model_name)(last_conv_stride=last_stride) self.gap = nn.AdaptiveAvgPool2D(output_size=1) self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) self.bn = nn.BatchNorm1D(self.in_planes, bias_attr=False) def forward(self, x): base_out = self.base(x) global_feat = self.gap(base_out) global_feat = self.flatten(global_feat) global_feat = self.bn(global_feat) return global_feat ================================================ FILE: ppdet/modeling/shape_spec.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py from collections import namedtuple class ShapeSpec( namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): def __new__(cls, channels=None, height=None, width=None, stride=None): return super(ShapeSpec, cls).__new__(cls, channels, height, width, stride) ================================================ FILE: ppdet/modeling/ssod/__init__.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import utils from . import losses from .utils import * from .losses import * ================================================ FILE: ppdet/modeling/ssod/losses.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
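# A usage sketch for ResNetEmbedding above (batch and crop sizes are
# assumptions for illustration):
#
#     m = ResNetEmbedding(model_name='ResNet50', last_stride=1)
#     m.eval()
#     feat = m(paddle.randn([2, 3, 256, 128]))  # base -> gap -> flatten -> bn
#     assert feat.shape == [2, 2048]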
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.losses.iou_loss import GIoULoss from .utils import QFLv2 from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'SSODFCOSLoss', 'SSODPPYOLOELoss', ] @register class SSODFCOSLoss(nn.Layer): def __init__(self, loss_weight=1.0): super(SSODFCOSLoss, self).__init__() self.loss_weight = loss_weight def forward(self, student_head_outs, teacher_head_outs, train_cfg): # for semi-det distill student_logits, student_deltas, student_quality = student_head_outs teacher_logits, teacher_deltas, teacher_quality = teacher_head_outs nc = student_logits[0].shape[1] student_logits = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, nc]) for _ in student_logits ], axis=0) teacher_logits = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, nc]) for _ in teacher_logits ], axis=0) student_deltas = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 4]) for _ in student_deltas ], axis=0) teacher_deltas = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 4]) for _ in teacher_deltas ], axis=0) student_quality = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 1]) for _ in student_quality ], axis=0) teacher_quality = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 1]) for _ in teacher_quality ], axis=0) ratio = train_cfg.get('ratio', 0.01) with paddle.no_grad(): # Region Selection count_num = int(teacher_logits.shape[0] * ratio) teacher_probs = F.sigmoid(teacher_logits) max_vals = paddle.max(teacher_probs, 1) sorted_vals, sorted_inds = paddle.topk(max_vals, teacher_logits.shape[0]) mask = paddle.zeros_like(max_vals) mask[sorted_inds[:count_num]] = 1. 
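            # Region selection: anchors are ranked by the teacher's highest
            # class probability and only the top `ratio` fraction (count_num)
            # get mask 1; fg_num below is the summed teacher confidence of the
            # selected positions and normalises the distillation cls loss.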
fg_num = sorted_vals[:count_num].sum() b_mask = mask > 0 # distill_loss_cls loss_logits = QFLv2( F.sigmoid(student_logits), teacher_probs, weight=mask, reduction="sum") / fg_num # distill_loss_box inputs = paddle.concat( (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), axis=-1) targets = paddle.concat( (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), axis=-1) iou_loss = GIoULoss(reduction='mean') loss_deltas = iou_loss(inputs, targets) # distill_loss_quality loss_quality = F.binary_cross_entropy( F.sigmoid(student_quality[b_mask]), F.sigmoid(teacher_quality[b_mask]), reduction='mean') return { "distill_loss_cls": loss_logits, "distill_loss_box": loss_deltas, "distill_loss_quality": loss_quality, "fg_sum": fg_num, } @register class SSODPPYOLOELoss(nn.Layer): def __init__(self, loss_weight=1.0): super(SSODPPYOLOELoss, self).__init__() self.loss_weight = loss_weight def forward(self, student_head_outs, teacher_head_outs, train_cfg): # for semi-det distill # student_probs: already sigmoid student_probs, student_deltas, student_dfl = student_head_outs teacher_probs, teacher_deltas, teacher_dfl = teacher_head_outs bs, l, nc = student_probs.shape[:] # bs, l, num_classes bs, l, _, reg_ch = student_dfl.shape[:] # bs, l, 4, reg_ch student_probs = student_probs.reshape([-1, nc]) teacher_probs = teacher_probs.reshape([-1, nc]) student_deltas = student_deltas.reshape([-1, 4]) teacher_deltas = teacher_deltas.reshape([-1, 4]) student_dfl = student_dfl.reshape([-1, 4, reg_ch]) teacher_dfl = teacher_dfl.reshape([-1, 4, reg_ch]) ratio = train_cfg.get('ratio', 0.01) # for contrast loss curr_iter = train_cfg['curr_iter'] st_iter = train_cfg['st_iter'] if curr_iter == st_iter + 1: # start semi-det training self.queue_ptr = 0 self.queue_size = int(bs * l * ratio) self.queue_feats = paddle.zeros([self.queue_size, nc]) self.queue_probs = paddle.zeros([self.queue_size, nc]) contrast_loss_cfg = train_cfg['contrast_loss'] temperature = contrast_loss_cfg.get('temperature', 0.2) alpha = contrast_loss_cfg.get('alpha', 0.9) smooth_iter = contrast_loss_cfg.get('smooth_iter', 100) + st_iter with paddle.no_grad(): # Region Selection count_num = int(teacher_probs.shape[0] * ratio) max_vals = paddle.max(teacher_probs, 1) sorted_vals, sorted_inds = paddle.topk(max_vals, teacher_probs.shape[0]) mask = paddle.zeros_like(max_vals) mask[sorted_inds[:count_num]] = 1. fg_num = sorted_vals[:count_num].sum() b_mask = mask > 0. 
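            # The block below adds a memory bank: queue_feats/queue_probs hold
            # the most recent selected teacher probabilities (FIFO via
            # queue_ptr), and after `smooth_iter` iterations each pseudo-label
            # is blended with bank entries weighted by similarity (the
            # "memory-smoothing" matrix A) before the pseudo-label graph Q is
            # built for the contrastive term.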
# for contrast loss probs = teacher_probs[b_mask].detach() if curr_iter > smooth_iter: # memory-smoothing A = paddle.exp( paddle.mm(teacher_probs[b_mask], self.queue_probs.t()) / temperature) A = A / A.sum(1, keepdim=True) probs = alpha * probs + (1 - alpha) * paddle.mm( A, self.queue_probs) n = student_probs[b_mask].shape[0] # update memory bank self.queue_feats[self.queue_ptr:self.queue_ptr + n, :] = teacher_probs[b_mask].detach() self.queue_probs[self.queue_ptr:self.queue_ptr + n, :] = teacher_probs[b_mask].detach() self.queue_ptr = (self.queue_ptr + n) % self.queue_size # embedding similarity sim = paddle.exp( paddle.mm(student_probs[b_mask], teacher_probs[b_mask].t()) / 0.2) sim_probs = sim / sim.sum(1, keepdim=True) # pseudo-label graph with self-loop Q = paddle.mm(probs, probs.t()) Q.fill_diagonal_(1) pos_mask = (Q >= 0.5).astype('float32') Q = Q * pos_mask Q = Q / Q.sum(1, keepdim=True) # contrastive loss loss_contrast = -(paddle.log(sim_probs + 1e-7) * Q).sum(1) loss_contrast = loss_contrast.mean() # distill_loss_cls loss_cls = QFLv2( student_probs, teacher_probs, weight=mask, reduction="sum") / fg_num # distill_loss_iou inputs = paddle.concat( (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), -1) targets = paddle.concat( (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), -1) iou_loss = GIoULoss(reduction='mean') loss_iou = iou_loss(inputs, targets) # distill_loss_dfl loss_dfl = F.cross_entropy( student_dfl[b_mask].reshape([-1, reg_ch]), teacher_dfl[b_mask].reshape([-1, reg_ch]), soft_label=True, reduction='mean') return { "distill_loss_cls": loss_cls, "distill_loss_iou": loss_iou, "distill_loss_dfl": loss_dfl, "distill_loss_contrast": loss_contrast, "fg_sum": fg_num, } ================================================ FILE: ppdet/modeling/ssod/utils.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
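# A toy sketch of QFLv2 (defined below): positions whose weight is > 0 are
# pulled toward the teacher's sigmoid score, all remaining valid positions
# toward zero, with both terms modulated by |target - pred| ** beta as in
# Quality Focal Loss. The tensors here are illustrative values only.
def _demo_qflv2():
    import paddle
    pred = paddle.to_tensor([[0.90, 0.20], [0.30, 0.10]])
    teacher = paddle.to_tensor([[0.80, 0.10], [0.40, 0.20]])
    weight = paddle.to_tensor([1.0, 0.0])  # only the first row is "foreground"
    return QFLv2(pred, teacher, weight=weight, reduction='mean')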
import paddle
import paddle.nn.functional as F


def align_weak_strong_shape(data_weak, data_strong):
    max_shape_x = max(data_strong['image'].shape[2],
                      data_weak['image'].shape[2])
    max_shape_y = max(data_strong['image'].shape[3],
                      data_weak['image'].shape[3])

    scale_x_s = max_shape_x / data_strong['image'].shape[2]
    scale_y_s = max_shape_y / data_strong['image'].shape[3]
    scale_x_w = max_shape_x / data_weak['image'].shape[2]
    scale_y_w = max_shape_y / data_weak['image'].shape[3]
    target_size = [max_shape_x, max_shape_y]

    if scale_x_s != 1 or scale_y_s != 1:
        data_strong['image'] = F.interpolate(
            data_strong['image'],
            size=target_size,
            mode='bilinear',
            align_corners=False)
        if 'gt_bbox' in data_strong:
            gt_bboxes = data_strong['gt_bbox'].numpy()
            for i in range(len(gt_bboxes)):
                if len(gt_bboxes[i]) > 0:
                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_s
                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_s
            data_strong['gt_bbox'] = paddle.to_tensor(gt_bboxes)

    if scale_x_w != 1 or scale_y_w != 1:
        data_weak['image'] = F.interpolate(
            data_weak['image'],
            size=target_size,
            mode='bilinear',
            align_corners=False)
        if 'gt_bbox' in data_weak:
            gt_bboxes = data_weak['gt_bbox'].numpy()
            for i in range(len(gt_bboxes)):
                if len(gt_bboxes[i]) > 0:
                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_w
                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_w
            data_weak['gt_bbox'] = paddle.to_tensor(gt_bboxes)
    return data_weak, data_strong


def QFLv2(pred_sigmoid, teacher_sigmoid, weight=None, beta=2.0,
          reduction='mean'):
    pt = pred_sigmoid
    zerolabel = paddle.zeros_like(pt)
    loss = F.binary_cross_entropy(
        pred_sigmoid, zerolabel, reduction='none') * pt.pow(beta)
    pos = weight > 0
    pt = teacher_sigmoid[pos] - pred_sigmoid[pos]
    loss[pos] = F.binary_cross_entropy(
        pred_sigmoid[pos], teacher_sigmoid[pos],
        reduction='none') * pt.pow(beta)
    valid = weight >= 0
    if reduction == "mean":
        loss = loss[valid].mean()
    elif reduction == "sum":
        loss = loss[valid].sum()
    return loss


def filter_invalid(bbox, label=None, score=None, thr=0.0, min_size=0):
    # Filter pseudo boxes by score threshold, then by minimum box size.
    # Guard against the default score=None before touching the tensor.
    if score is not None and score.numel() > 0:
        soft_score = score.max(-1)
        valid = soft_score >= thr
        bbox = bbox[valid]
        if label is not None:
            label = label[valid]
        score = score[valid]
    if min_size is not None and bbox.shape[0] > 0:
        bw = bbox[:, 2]
        bh = bbox[:, 3]
        valid = (bw > min_size) & (bh > min_size)
        bbox = bbox[valid]
        if label is not None:
            label = label[valid]
        if score is not None:
            score = score[valid]
    return bbox, label, score


================================================
FILE: ppdet/modeling/tests/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

================================================
FILE: ppdet/modeling/tests/test_architectures.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import unittest import ppdet class TestFasterRCNN(unittest.TestCase): def setUp(self): self.set_config() def set_config(self): self.cfg_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml' def test_trainer(self): # Trainer __init__ will build model and DataLoader # 'train' and 'eval' mode include dataset loading # use 'test' mode to simplify tests cfg = ppdet.core.workspace.load_config(self.cfg_file) trainer = ppdet.engine.Trainer(cfg, mode='test') class TestMaskRCNN(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml' class TestCascadeRCNN(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml' class TestYolov3(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/yolov3/yolov3_darknet53_270e_coco.yml' class TestSSD(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/ssd/ssd_vgg16_300_240e_voc.yml' class TestGFL(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/gfl/gfl_r50_fpn_1x_coco.yml' class TestPicoDet(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/picodet/picodet_s_320_coco_lcnet.yml' if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/modeling/tests/test_base.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
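# Usage note (assuming a PaddleDetection-style checkout with the configs/
# directory available): the smoke tests in this package can be run from the
# repo root, e.g.
#     python -m unittest ppdet.modeling.tests.test_architectures -v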
from __future__ import print_function import unittest import contextlib import paddle from paddle.static import Program class LayerTest(unittest.TestCase): @classmethod def setUpClass(cls): cls.seed = 111 @classmethod def tearDownClass(cls): pass def _get_place(self, force_to_use_cpu=False): # this option for ops that only have cpu kernel if force_to_use_cpu: return 'cpu' else: return paddle.device.get_device() @contextlib.contextmanager def static_graph(self): paddle.enable_static() scope = paddle.static.Scope() program = Program() with paddle.static.scope_guard(scope): with paddle.static.program_guard(program): paddle.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) yield def get_static_graph_result(self, feed, fetch_list, with_lod=False, force_to_use_cpu=False): exe = paddle.static.Executor(self._get_place(force_to_use_cpu)) exe.run(paddle.static.default_startup_program()) return exe.run(paddle.static.default_main_program(), feed=feed, fetch_list=fetch_list, return_numpy=(not with_lod)) @contextlib.contextmanager def dynamic_graph(self, force_to_use_cpu=False): paddle.disable_static() place = self._get_place(force_to_use_cpu=force_to_use_cpu) paddle.device.set_device(place) paddle.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) yield ================================================ FILE: ppdet/modeling/tests/test_mstest.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import unittest from ppdet.core.workspace import load_config from ppdet.engine import Trainer class TestMultiScaleInference(unittest.TestCase): def setUp(self): self.set_config() def set_config(self): self.mstest_cfg_file = 'configs/faster_rcnn/faster_rcnn_r34_fpn_multiscaletest_1x_coco.yml' # test evaluation with multi scale test def test_eval_mstest(self): cfg = load_config(self.mstest_cfg_file) trainer = Trainer(cfg, mode='eval') cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams' trainer.load_weights(cfg.weights) trainer.evaluate() # test inference with multi scale test def test_infer_mstest(self): cfg = load_config(self.mstest_cfg_file) trainer = Trainer(cfg, mode='test') cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams' trainer.load_weights(cfg.weights) tests_img_root = os.path.join(os.path.dirname(__file__), 'imgs') # input images to predict imgs = [ 'coco2017_val2017_000000000139.jpg', 'coco2017_val2017_000000000724.jpg' ] imgs = [os.path.join(tests_img_root, img) for img in imgs] trainer.predict( imgs, draw_threshold=0.5, output_dir='output', save_results=False) if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/modeling/tests/test_ops.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import os, sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) if parent_path not in sys.path: sys.path.append(parent_path) import unittest import numpy as np import paddle import ppdet.modeling.ops as ops from ppdet.modeling.tests.test_base import LayerTest def make_rois(h, w, rois_num, output_size): rois = np.zeros((0, 4)).astype('float32') for roi_num in rois_num: roi = np.zeros((roi_num, 4)).astype('float32') roi[:, 0] = np.random.randint(0, h - output_size[0], size=roi_num) roi[:, 1] = np.random.randint(0, w - output_size[1], size=roi_num) roi[:, 2] = np.random.randint(roi[:, 0] + output_size[0], h) roi[:, 3] = np.random.randint(roi[:, 1] + output_size[1], w) rois = np.vstack((rois, roi)) return rois def softmax(x): # clip to shiftx, otherwise, when calc loss with # log(exp(shiftx)), may get log(0)=INF shiftx = (x - np.max(x)).clip(-64.) 
exps = np.exp(shiftx) return exps / np.sum(exps) class TestROIAlign(LayerTest): def test_roi_align(self): b, c, h, w = 2, 12, 20, 20 inputs_np = np.random.rand(b, c, h, w).astype('float32') rois_num = [4, 6] output_size = (7, 7) rois_np = make_rois(h, w, rois_num, output_size) rois_num_np = np.array(rois_num).astype('int32') with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[b, c, h, w], dtype='float32') rois = paddle.static.data( name='rois', shape=[10, 4], dtype='float32') rois_num = paddle.static.data( name='rois_num', shape=[None], dtype='int32') output = paddle.vision.ops.roi_align( x=inputs, boxes=rois, boxes_num=rois_num, output_size=output_size) output_np, = self.get_static_graph_result( feed={ 'inputs': inputs_np, 'rois': rois_np, 'rois_num': rois_num_np }, fetch_list=output, with_lod=False) with self.dynamic_graph(): inputs_dy = paddle.to_tensor(inputs_np) rois_dy = paddle.to_tensor(rois_np) rois_num_dy = paddle.to_tensor(rois_num_np) output_dy = paddle.vision.ops.roi_align( x=inputs_dy, boxes=rois_dy, boxes_num=rois_num_dy, output_size=output_size) output_dy_np = output_dy.numpy() self.assertTrue(np.array_equal(output_np, output_dy_np)) def test_roi_align_error(self): with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[2, 12, 20, 20], dtype='float32') rois = paddle.static.data( name='data_error', shape=[10, 4], dtype='int32', lod_level=1) self.assertRaises( TypeError, paddle.vision.ops.roi_align, input=inputs, rois=rois, output_size=(7, 7)) paddle.disable_static() class TestROIPool(LayerTest): def test_roi_pool(self): b, c, h, w = 2, 12, 20, 20 inputs_np = np.random.rand(b, c, h, w).astype('float32') rois_num = [4, 6] output_size = (7, 7) rois_np = make_rois(h, w, rois_num, output_size) rois_num_np = np.array(rois_num).astype('int32') with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[b, c, h, w], dtype='float32') rois = paddle.static.data( name='rois', shape=[10, 4], dtype='float32') rois_num = paddle.static.data( name='rois_num', shape=[None], dtype='int32') output = paddle.vision.ops.roi_pool( x=inputs, boxes=rois, boxes_num=rois_num, output_size=output_size) output_np, = self.get_static_graph_result( feed={ 'inputs': inputs_np, 'rois': rois_np, 'rois_num': rois_num_np }, fetch_list=[output], with_lod=False) with self.dynamic_graph(): inputs_dy = paddle.to_tensor(inputs_np) rois_dy = paddle.to_tensor(rois_np) rois_num_dy = paddle.to_tensor(rois_num_np) output_dy = paddle.vision.ops.roi_pool( x=inputs_dy, boxes=rois_dy, boxes_num=rois_num_dy, output_size=output_size) output_dy_np = output_dy.numpy() self.assertTrue(np.array_equal(output_np, output_dy_np)) def test_roi_pool_error(self): with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[2, 12, 20, 20], dtype='float32') rois = paddle.static.data( name='data_error', shape=[10, 4], dtype='int32', lod_level=1) self.assertRaises( TypeError, paddle.vision.ops.roi_pool, input=inputs, rois=rois, output_size=(7, 7)) paddle.disable_static() class TestPriorBox(LayerTest): def test_prior_box(self): input_np = np.random.rand(2, 10, 32, 32).astype('float32') image_np = np.random.rand(2, 10, 40, 40).astype('float32') min_sizes = [2, 4] with self.static_graph(): input = paddle.static.data( name='input', shape=[2, 10, 32, 32], dtype='float32') image = paddle.static.data( name='image', shape=[2, 10, 40, 40], dtype='float32') box, var = ops.prior_box( input=input, image=image, min_sizes=min_sizes, clip=True, flip=True) box_np, var_np = 
self.get_static_graph_result( feed={ 'input': input_np, 'image': image_np, }, fetch_list=[box, var], with_lod=False) with self.dynamic_graph(): inputs_dy = paddle.to_tensor(input_np) image_dy = paddle.to_tensor(image_np) box_dy, var_dy = ops.prior_box( input=inputs_dy, image=image_dy, min_sizes=min_sizes, clip=True, flip=True) box_dy_np = box_dy.numpy() var_dy_np = var_dy.numpy() self.assertTrue(np.array_equal(box_np, box_dy_np)) self.assertTrue(np.array_equal(var_np, var_dy_np)) def test_prior_box_error(self): with self.static_graph(): input = paddle.static.data( name='input', shape=[2, 10, 32, 32], dtype='int32') image = paddle.static.data( name='image', shape=[2, 10, 40, 40], dtype='int32') self.assertRaises( TypeError, ops.prior_box, input=input, image=image, min_sizes=[2, 4], clip=True, flip=True) paddle.disable_static() class TestMulticlassNms(LayerTest): def test_multiclass_nms(self): boxes_np = np.random.rand(10, 81, 4).astype('float32') scores_np = np.random.rand(10, 81).astype('float32') rois_num_np = np.array([2, 8]).astype('int32') with self.static_graph(): boxes = paddle.static.data( name='bboxes', shape=[None, 81, 4], dtype='float32', lod_level=1) scores = paddle.static.data( name='scores', shape=[None, 81], dtype='float32', lod_level=1) rois_num = paddle.static.data( name='rois_num', shape=[None], dtype='int32') output = ops.multiclass_nms( bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True, rois_num=rois_num) out_np, index_np, nms_rois_num_np = self.get_static_graph_result( feed={ 'bboxes': boxes_np, 'scores': scores_np, 'rois_num': rois_num_np }, fetch_list=output, with_lod=True) out_np = np.array(out_np) index_np = np.array(index_np) nms_rois_num_np = np.array(nms_rois_num_np) with self.dynamic_graph(): boxes_dy = paddle.to_tensor(boxes_np) scores_dy = paddle.to_tensor(scores_np) rois_num_dy = paddle.to_tensor(rois_num_np) out_dy, index_dy, nms_rois_num_dy = ops.multiclass_nms( bboxes=boxes_dy, scores=scores_dy, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True, rois_num=rois_num_dy) out_dy_np = out_dy.numpy() index_dy_np = index_dy.numpy() nms_rois_num_dy_np = nms_rois_num_dy.numpy() self.assertTrue(np.array_equal(out_np, out_dy_np)) self.assertTrue(np.array_equal(index_np, index_dy_np)) self.assertTrue(np.array_equal(nms_rois_num_np, nms_rois_num_dy_np)) def test_multiclass_nms_error(self): with self.static_graph(): boxes = paddle.static.data( name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) scores = paddle.static.data( name='scores', shape=[81], dtype='float32', lod_level=1) rois_num = paddle.static.data( name='rois_num', shape=[40, 41], dtype='int32') self.assertRaises( TypeError, ops.multiclass_nms, boxes=boxes, scores=scores, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True, rois_num=rois_num) class TestMatrixNMS(LayerTest): def test_matrix_nms(self): N, M, C = 7, 1200, 21 BOX_SIZE = 4 nms_top_k = 400 keep_top_k = 200 score_threshold = 0.01 post_threshold = 0. 
scores_np = np.random.random((N * M, C)).astype('float32') scores_np = np.apply_along_axis(softmax, 1, scores_np) scores_np = np.reshape(scores_np, (N, M, C)) scores_np = np.transpose(scores_np, (0, 2, 1)) boxes_np = np.random.random((N, M, BOX_SIZE)).astype('float32') boxes_np[:, :, 0:2] = boxes_np[:, :, 0:2] * 0.5 boxes_np[:, :, 2:4] = boxes_np[:, :, 2:4] * 0.5 + 0.5 with self.static_graph(): boxes = paddle.static.data( name='boxes', shape=[N, M, BOX_SIZE], dtype='float32') scores = paddle.static.data( name='scores', shape=[N, C, M], dtype='float32') out, index, _ = ops.matrix_nms( bboxes=boxes, scores=scores, score_threshold=score_threshold, post_threshold=post_threshold, nms_top_k=nms_top_k, keep_top_k=keep_top_k, return_index=True) out_np, index_np = self.get_static_graph_result( feed={'boxes': boxes_np, 'scores': scores_np}, fetch_list=[out, index], with_lod=True) with self.dynamic_graph(): boxes_dy = paddle.to_tensor(boxes_np) scores_dy = paddle.to_tensor(scores_np) out_dy, index_dy, _ = ops.matrix_nms( bboxes=boxes_dy, scores=scores_dy, score_threshold=score_threshold, post_threshold=post_threshold, nms_top_k=nms_top_k, keep_top_k=keep_top_k, return_index=True) out_dy_np = out_dy.numpy() index_dy_np = index_dy.numpy() self.assertTrue(np.array_equal(out_np, out_dy_np)) self.assertTrue(np.array_equal(index_np, index_dy_np)) def test_matrix_nms_error(self): with self.static_graph(): bboxes = paddle.static.data( name='bboxes', shape=[7, 1200, 4], dtype='float32') scores = paddle.static.data( name='data_error', shape=[7, 21, 1200], dtype='int32') self.assertRaises( TypeError, ops.matrix_nms, bboxes=bboxes, scores=scores, score_threshold=0.01, post_threshold=0., nms_top_k=400, keep_top_k=200, return_index=True) paddle.disable_static() class TestBoxCoder(LayerTest): def test_box_coder(self): prior_box_np = np.random.random((81, 4)).astype('float32') prior_box_var_np = np.random.random((81, 4)).astype('float32') target_box_np = np.random.random((20, 81, 4)).astype('float32') # static with self.static_graph(): prior_box = paddle.static.data( name='prior_box', shape=[81, 4], dtype='float32') prior_box_var = paddle.static.data( name='prior_box_var', shape=[81, 4], dtype='float32') target_box = paddle.static.data( name='target_box', shape=[20, 81, 4], dtype='float32') boxes = ops.box_coder( prior_box=prior_box, prior_box_var=prior_box_var, target_box=target_box, code_type="decode_center_size", box_normalized=False) boxes_np, = self.get_static_graph_result( feed={ 'prior_box': prior_box_np, 'prior_box_var': prior_box_var_np, 'target_box': target_box_np, }, fetch_list=[boxes], with_lod=False) # dygraph with self.dynamic_graph(): prior_box_dy = paddle.to_tensor(prior_box_np) prior_box_var_dy = paddle.to_tensor(prior_box_var_np) target_box_dy = paddle.to_tensor(target_box_np) boxes_dy = ops.box_coder( prior_box=prior_box_dy, prior_box_var=prior_box_var_dy, target_box=target_box_dy, code_type="decode_center_size", box_normalized=False) boxes_dy_np = boxes_dy.numpy() self.assertTrue(np.array_equal(boxes_np, boxes_dy_np)) def test_box_coder_error(self): with self.static_graph(): prior_box = paddle.static.data( name='prior_box', shape=[81, 4], dtype='int32') prior_box_var = paddle.static.data( name='prior_box_var', shape=[81, 4], dtype='float32') target_box = paddle.static.data( name='target_box', shape=[20, 81, 4], dtype='float32') self.assertRaises(TypeError, ops.box_coder, prior_box, prior_box_var, target_box) paddle.disable_static() if __name__ == '__main__': unittest.main() 
================================================ FILE: ppdet/modeling/tests/test_yolov3_loss.py ================================================ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import division import unittest import paddle import paddle.nn.functional as F # add python path of PaddleDetection to sys.path import os import sys parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.modeling.losses import YOLOv3Loss from ppdet.data.transform.op_helper import jaccard_overlap from ppdet.modeling.bbox_utils import iou_similarity import numpy as np np.random.seed(0) def _split_output(output, an_num, num_classes): """ Split output feature map to x, y, w, h, objectness, classification along channel dimension """ x = paddle.strided_slice( output, axes=[1], starts=[0], ends=[output.shape[1]], strides=[5 + num_classes]) y = paddle.strided_slice( output, axes=[1], starts=[1], ends=[output.shape[1]], strides=[5 + num_classes]) w = paddle.strided_slice( output, axes=[1], starts=[2], ends=[output.shape[1]], strides=[5 + num_classes]) h = paddle.strided_slice( output, axes=[1], starts=[3], ends=[output.shape[1]], strides=[5 + num_classes]) obj = paddle.strided_slice( output, axes=[1], starts=[4], ends=[output.shape[1]], strides=[5 + num_classes]) clss = [] stride = output.shape[1] // an_num for m in range(an_num): clss.append( paddle.slice( output, axes=[1], starts=[stride * m + 5], ends=[stride * m + 5 + num_classes])) cls = paddle.transpose(paddle.stack(clss, axis=1), perm=[0, 1, 3, 4, 2]) return (x, y, w, h, obj, cls) def _split_target(target): """ split target to x, y, w, h, objectness, classification along dimension 2 target is in shape [N, an_num, 6 + class_num, H, W] """ tx = target[:, :, 0, :, :] ty = target[:, :, 1, :, :] tw = target[:, :, 2, :, :] th = target[:, :, 3, :, :] tscale = target[:, :, 4, :, :] tobj = target[:, :, 5, :, :] tcls = paddle.transpose(target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2]) tcls.stop_gradient = True return (tx, ty, tw, th, tscale, tobj, tcls) def _calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, ignore_thresh, scale_x_y): # A prediction bbox overlap any gt_bbox over ignore_thresh, # objectness loss will be ignored, process as follows: # 1. get pred bbox, which is same with YOLOv3 infer mode, use yolo_box here # NOTE: img_size is set as 1.0 to get noramlized pred bbox bbox, prob = paddle.vision.ops.yolo_box( x=output, img_size=paddle.ones( shape=[batch_size, 2], dtype="int32"), anchors=anchors, class_num=num_classes, conf_thresh=0., downsample_ratio=downsample, clip_bbox=False, scale_x_y=scale_x_y) # 2. 
split pred bbox and gt bbox by sample, calculate IoU between pred bbox # and gt bbox in each sample if batch_size > 1: preds = paddle.split(bbox, batch_size, axis=0) gts = paddle.split(gt_box, batch_size, axis=0) else: preds = [bbox] gts = [gt_box] probs = [prob] ious = [] for pred, gt in zip(preds, gts): def box_xywh2xyxy(box): x = box[:, 0] y = box[:, 1] w = box[:, 2] h = box[:, 3] return paddle.stack( [ x - w / 2., y - h / 2., x + w / 2., y + h / 2., ], axis=1) pred = paddle.squeeze(pred, axis=[0]) gt = box_xywh2xyxy(paddle.squeeze(gt, axis=[0])) ious.append(iou_similarity(pred, gt)) iou = paddle.stack(ious, axis=0) # 3. Get iou_mask by IoU between gt bbox and prediction bbox, # Get obj_mask by tobj(holds gt_score), calculate objectness loss max_iou = paddle.max(iou, axis=-1) iou_mask = paddle.cast(max_iou <= ignore_thresh, dtype="float32") output_shape = output.shape an_num = len(anchors) // 2 iou_mask = paddle.reshape(iou_mask, (-1, an_num, output_shape[2], output_shape[3])) iou_mask.stop_gradient = True # NOTE: tobj holds gt_score, obj_mask holds object existence mask obj_mask = paddle.cast(tobj > 0., dtype="float32") obj_mask.stop_gradient = True # For positive objectness grids, objectness loss should be calculated # For negative objectness grids, objectness loss is calculated only iou_mask == 1.0 obj_sigmoid = F.sigmoid(obj) loss_obj = F.binary_cross_entropy(obj_sigmoid, obj_mask, reduction='none') loss_obj_pos = paddle.sum(loss_obj * tobj, axis=[1, 2, 3]) loss_obj_neg = paddle.sum(loss_obj * (1.0 - obj_mask) * iou_mask, axis=[1, 2, 3]) return loss_obj_pos, loss_obj_neg def fine_grained_loss(output, target, gt_box, batch_size, num_classes, anchors, ignore_thresh, downsample, scale_x_y=1., eps=1e-10): an_num = len(anchors) // 2 x, y, w, h, obj, cls = _split_output(output, an_num, num_classes) tx, ty, tw, th, tscale, tobj, tcls = _split_target(target) tscale_tobj = tscale * tobj scale_x_y = scale_x_y if (abs(scale_x_y - 1.0) < eps): x = F.sigmoid(x) y = F.sigmoid(y) loss_x = F.binary_cross_entropy(x, tx, reduction='none') * tscale_tobj loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) loss_y = F.binary_cross_entropy(y, ty, reduction='none') * tscale_tobj loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) else: dx = scale_x_y * F.sigmoid(x) - 0.5 * (scale_x_y - 1.0) dy = scale_x_y * F.sigmoid(y) - 0.5 * (scale_x_y - 1.0) loss_x = paddle.abs(dx - tx) * tscale_tobj loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) loss_y = paddle.abs(dy - ty) * tscale_tobj loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) # NOTE: we refined loss function of (w, h) as L1Loss loss_w = paddle.abs(w - tw) * tscale_tobj loss_w = paddle.sum(loss_w, axis=[1, 2, 3]) loss_h = paddle.abs(h - th) * tscale_tobj loss_h = paddle.sum(loss_h, axis=[1, 2, 3]) loss_obj_pos, loss_obj_neg = _calc_obj_loss( output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, ignore_thresh, scale_x_y) cls = F.sigmoid(cls) loss_cls = F.binary_cross_entropy(cls, tcls, reduction='none') tobj = paddle.unsqueeze(tobj, axis=-1) loss_cls = paddle.multiply(loss_cls, tobj) loss_cls = paddle.sum(loss_cls, axis=[1, 2, 3, 4]) loss_xys = paddle.mean(loss_x + loss_y) loss_whs = paddle.mean(loss_w + loss_h) loss_objs = paddle.mean(loss_obj_pos + loss_obj_neg) loss_clss = paddle.mean(loss_cls) losses_all = { "loss_xy": paddle.sum(loss_xys), "loss_wh": paddle.sum(loss_whs), "loss_loc": paddle.sum(loss_xys) + paddle.sum(loss_whs), "loss_obj": paddle.sum(loss_objs), "loss_cls": paddle.sum(loss_clss), } return losses_all, x, y, tx, ty def gt2yolotarget(gt_bbox, 
                  gt_class, gt_score, anchors, mask, num_classes, size,
                  stride):
    grid_h, grid_w = size
    h, w = grid_h * stride, grid_w * stride
    an_hw = np.array(anchors) / np.array([[w, h]])
    target = np.zeros(
        (len(mask), 6 + num_classes, grid_h, grid_w), dtype=np.float32)
    for b in range(gt_bbox.shape[0]):
        gx, gy, gw, gh = gt_bbox[b, :]
        cls = gt_class[b]
        score = gt_score[b]
        if gw <= 0. or gh <= 0. or score <= 0.:
            continue

        # find the best matching anchor index
        best_iou = 0.
        best_idx = -1
        for an_idx in range(an_hw.shape[0]):
            iou = jaccard_overlap(
                [0., 0., gw, gh],
                [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]])
            if iou > best_iou:
                best_iou = iou
                best_idx = an_idx

        gi = int(gx * grid_w)
        gj = int(gy * grid_h)

        # the gt box should be regressed in this layer if the best matching
        # anchor index is in this layer's anchor mask
        if best_idx in mask:
            best_n = mask.index(best_idx)

            # x, y, w, h, scale
            target[best_n, 0, gj, gi] = gx * grid_w - gi
            target[best_n, 1, gj, gi] = gy * grid_h - gj
            target[best_n, 2, gj, gi] = np.log(gw * w / anchors[best_idx][0])
            target[best_n, 3, gj, gi] = np.log(gh * h / anchors[best_idx][1])
            target[best_n, 4, gj, gi] = 2.0 - gw * gh

            # objectness records gt_score
            # if target[best_n, 5, gj, gi] > 0:
            #     print('find 1 duplicate')
            target[best_n, 5, gj, gi] = score

            # classification
            target[best_n, 6 + cls, gj, gi] = 1.

    return target


class TestYolov3LossOp(unittest.TestCase):
    def setUp(self):
        self.initTestCase()
        x = np.random.uniform(0, 1, self.x_shape).astype('float64')
        gtbox = np.random.random(size=self.gtbox_shape).astype('float64')
        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])
        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])
        gtbox = gtbox * gtmask[:, :, np.newaxis]
        gtlabel = gtlabel * gtmask
        gtscore = np.ones(self.gtbox_shape[:2]).astype('float64')
        if self.gtscore:
            gtscore = np.random.random(self.gtbox_shape[:2]).astype('float64')
        target = []
        for box, label, score in zip(gtbox, gtlabel, gtscore):
            target.append(
                gt2yolotarget(box, label, score, self.anchors,
                              self.anchor_mask, self.class_num,
                              (self.h, self.w), self.downsample_ratio))
        self.target = np.array(target).astype('float64')

        self.mask_anchors = []
        for i in self.anchor_mask:
            self.mask_anchors.extend(self.anchors[i])
        self.x = x
        self.gtbox = gtbox
        self.gtlabel = gtlabel
        self.gtscore = gtscore

    def initTestCase(self):
        self.b = 8
        self.h = 19
        self.w = 19
        self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                        [59, 119], [116, 90], [156, 198], [373, 326]]
        self.anchor_mask = [6, 7, 8]
        self.na = len(self.anchor_mask)
        self.class_num = 80
        self.ignore_thresh = 0.7
        self.downsample_ratio = 32
        self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num),
                        self.h, self.w)
        self.gtbox_shape = (self.b, 40, 4)
        self.gtscore = True
        self.use_label_smooth = False
        self.scale_x_y = 1.
def test_loss(self): x, gtbox, gtlabel, gtscore, target = self.x, self.gtbox, self.gtlabel, self.gtscore, self.target yolo_loss = YOLOv3Loss( ignore_thresh=self.ignore_thresh, label_smooth=self.use_label_smooth, num_classes=self.class_num, downsample=self.downsample_ratio, scale_x_y=self.scale_x_y) x = paddle.to_tensor(x.astype(np.float32)) gtbox = paddle.to_tensor(gtbox.astype(np.float32)) gtlabel = paddle.to_tensor(gtlabel.astype(np.float32)) gtscore = paddle.to_tensor(gtscore.astype(np.float32)) t = paddle.to_tensor(target.astype(np.float32)) anchor = [self.anchors[i] for i in self.anchor_mask] (yolo_loss1, px, py, tx, ty) = fine_grained_loss( output=x, target=t, gt_box=gtbox, batch_size=self.b, num_classes=self.class_num, anchors=self.mask_anchors, ignore_thresh=self.ignore_thresh, downsample=self.downsample_ratio, scale_x_y=self.scale_x_y) yolo_loss2 = yolo_loss.yolov3_loss( x, t, gtbox, anchor, self.downsample_ratio, self.scale_x_y) for k in yolo_loss2: self.assertAlmostEqual( float(yolo_loss1[k]), float(yolo_loss2[k]), delta=1e-2, msg=k) class TestYolov3LossNoGTScore(TestYolov3LossOp): def initTestCase(self): self.b = 1 self.h = 76 self.w = 76 self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]] self.anchor_mask = [0, 1, 2] self.na = len(self.anchor_mask) self.class_num = 80 self.ignore_thresh = 0.7 self.downsample_ratio = 8 self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), self.h, self.w) self.gtbox_shape = (self.b, 40, 4) self.gtscore = False self.use_label_smooth = False self.scale_x_y = 1. class TestYolov3LossWithScaleXY(TestYolov3LossOp): def initTestCase(self): self.b = 5 self.h = 38 self.w = 38 self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]] self.anchor_mask = [3, 4, 5] self.na = len(self.anchor_mask) self.class_num = 80 self.ignore_thresh = 0.7 self.downsample_ratio = 16 self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), self.h, self.w) self.gtbox_shape = (self.b, 40, 4) self.gtscore = True self.use_label_smooth = False self.scale_x_y = 1.2 if __name__ == "__main__": unittest.main() ================================================ FILE: ppdet/modeling/transformers/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import detr_transformer from . import utils from . import matchers from . import position_encoding from . import deformable_transformer from . import dino_transformer from . import group_detr_transformer from . import mask_dino_transformer from . import rtdetr_transformer from . import hybrid_encoder from . import mask_rtdetr_transformer from . import rtdetr_transformerv2 from . 
import rtdetr_transformerv3 from .detr_transformer import * from .utils import * from .matchers import * from .position_encoding import * from .deformable_transformer import * from .dino_transformer import * from .petr_transformer import * from .group_detr_transformer import * from .mask_dino_transformer import * from .rtdetr_transformer import * from .hybrid_encoder import * from .mask_rtdetr_transformer import * from .rtdetr_transformerv2 import * from .rtdetr_transformerv3 import * ================================================ FILE: ppdet/modeling/transformers/deformable_transformer.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from ppdet.core.workspace import register from ..layers import MultiHeadAttention from .position_encoding import PositionEmbedding from .utils import _get_clones, get_valid_ratio from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ __all__ = ['DeformableTransformer'] class MSDeformableAttention(nn.Layer): def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, lr_mult=0.1): """ Multi-Scale Deformable Attention Module """ super(MSDeformableAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.num_levels = num_levels self.num_points = num_points self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) try: # use cuda op from deformable_detr_ops import ms_deformable_attn except: # use paddle func from .utils import deformable_attention_core_func as ms_deformable_attn self.ms_deformable_attn_core = ms_deformable_attn self._reset_parameters() def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) thetas = paddle.arange( self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( [1, self.num_levels, self.num_points, 1]) scaling = paddle.arange( 1, self.num_points + 1, dtype=paddle.float32).reshape([1, 1, -1, 1]) grid_init *= scaling 
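        # (editorial note, not in the original source) After the loop above,
        # each attention head's initial offsets point along a distinct
        # direction of the unit circle, scaled by the point index
        # 1..num_points, so the k-th sampling point of every head starts k
        # cells away from the reference point; the bias below stores this as
        # the starting state before training.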
self.sampling_offsets.bias.set_value(grid_init.flatten()) # attention_weights constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) # proj xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, reference_points, value, value_spatial_shapes, value_level_start_index, value_mask=None): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] assert int(value_spatial_shapes.prod(1).sum()) == Len_v value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = value_spatial_shapes.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype) elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output class DeformableTransformerEncoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0.1, activation="relu", n_levels=4, n_points=4, lr_mult=0.1, weight_attr=None, bias_attr=None): super(DeformableTransformerEncoderLayer, self).__init__() # self attention self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, lr_mult) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) self.activation = getattr(F, activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, reference_points, spatial_shapes, level_start_index, src_mask=None, query_pos_embed=None): # self attention src2 = self.self_attn( self.with_pos_embed(src, query_pos_embed), reference_points, src, spatial_shapes, level_start_index, src_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class DeformableTransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers): super(DeformableTransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): valid_ratios = valid_ratios.unsqueeze(1) reference_points = [] for i, (H, W) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * H) ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * W) reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) reference_points = paddle.concat(reference_points, 1).unsqueeze(2) reference_points = reference_points * valid_ratios return reference_points def forward(self, feat, spatial_shapes, level_start_index, feat_mask=None, query_pos_embed=None, valid_ratios=None): if valid_ratios is None: valid_ratios = paddle.ones( [feat.shape[0], spatial_shapes.shape[0], 2]) reference_points = self.get_reference_points(spatial_shapes, valid_ratios) for layer in self.layers: feat = layer(feat, reference_points, spatial_shapes, level_start_index, feat_mask, query_pos_embed) return feat class DeformableTransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0.1, activation="relu", n_levels=4, n_points=4, lr_mult=0.1, weight_attr=None, bias_attr=None): super(DeformableTransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, 
weight_attr=weight_attr, bias_attr=bias_attr) # cross attention self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, lr_mult) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) tgt2 = self.self_attn(q, k, value=tgt) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt = self.forward_ffn(tgt) return tgt class DeformableTransformerDecoder(nn.Layer): def __init__(self, decoder_layer, num_layers, return_intermediate=False): super(DeformableTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.return_intermediate = return_intermediate def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=None, query_pos_embed=None): output = tgt intermediate = [] for lid, layer in enumerate(self.layers): output = layer(output, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask, query_pos_embed) if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class DeformableTransformer(nn.Layer): __shared__ = ['hidden_dim'] def __init__(self, num_queries=300, position_embed_type='sine', return_intermediate_dec=True, in_feats_channel=[512, 1024, 2048], num_feature_levels=4, num_encoder_points=4, num_decoder_points=4, hidden_dim=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", lr_mult=0.1, pe_temperature=10000, pe_offset=-0.5): super(DeformableTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
        assert len(in_feats_channel) <= num_feature_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_feature_levels = num_feature_levels

        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_encoder_points, lr_mult)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)

        decoder_layer = DeformableTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_decoder_points)
        self.decoder = DeformableTransformerDecoder(
            decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)

        self.reference_points = nn.Linear(
            hidden_dim,
            2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim)))
        in_channels = in_feats_channel[-1]
        for _ in range(num_feature_levels - len(in_feats_channel)):
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels,
                        hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1),
                    nn.GroupNorm(32, hidden_dim)))
            in_channels = hidden_dim

        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type,
            offset=pe_offset,
            eps=1e-4)

        self._reset_parameters()

    def _reset_parameters(self):
        normal_(self.level_embed.weight)
        normal_(self.tgt_embed.weight)
        normal_(self.query_pos_embed.weight)
        xavier_uniform_(self.reference_points.weight)
        constant_(self.reference_points.bias)
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)
            constant_(l[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {'in_feats_channel': [i.channels for i in input_shape], }

    def forward(self, src_feats, src_mask=None, *args, **kwargs):
        srcs = []
        for i in range(len(src_feats)):
            srcs.append(self.input_proj[i](src_feats[i]))
        if self.num_feature_levels > len(srcs):
            len_srcs = len(srcs)
            for i in range(len_srcs, self.num_feature_levels):
                if i == len_srcs:
                    srcs.append(self.input_proj[i](src_feats[-1]))
                else:
                    srcs.append(self.input_proj[i](srcs[-1]))
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for level, src in enumerate(srcs):
            src_shape = paddle.shape(src)
            bs = src_shape[0:1]
            h = src_shape[2:3]
            w = src_shape[3:4]
            spatial_shapes.append(paddle.concat([h, w]))
            src = src.flatten(2).transpose([0, 2, 1])
            src_flatten.append(src)
            if src_mask is not None:
                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            valid_ratios.append(get_valid_ratio(mask))
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            lvl_pos_embed = pos_embed + self.level_embed.weight[level]
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            mask = mask.flatten(1)
            mask_flatten.append(mask)
        src_flatten = paddle.concat(src_flatten, 1)
        mask_flatten = None if src_mask is None else paddle.concat(
            mask_flatten, 1)
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [l, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l], the start index of each level in the flattened sequence
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
        ])
        # [b, l, 2]
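        # (editorial note, not in the original source) valid_ratios[b, l]
        # holds the fraction of non-padded width/height of level l for sample
        # b; with no padding mask it is all ones, so reference points below
        # are left unscaled.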
valid_ratios = paddle.stack(valid_ratios, 1) # encoder memory = self.encoder(src_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) # prepare input for decoder bs, _, c = memory.shape query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1]) tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) reference_points = F.sigmoid(self.reference_points(query_embed)) reference_points_input = reference_points.unsqueeze( 2) * valid_ratios.unsqueeze(1) # decoder hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes, level_start_index, mask_flatten, query_embed) return (hs, memory, reference_points) class QRDeformableTransformerDecoder(DeformableTransformerDecoder): def __init__(self, decoder_layer, num_layers, start_q=None, end_q=None, return_intermediate=False): super(QRDeformableTransformerDecoder, self).__init__( decoder_layer, num_layers, return_intermediate=return_intermediate) self.start_q = start_q self.end_q = end_q def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=None, query_pos_embed=None): if not self.training: return super(QRDeformableTransformerDecoder, self).forward( tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=memory_mask, query_pos_embed=query_pos_embed) batchsize = tgt.shape[0] query_list_reserve = [tgt] intermediate = [] for lid, layer in enumerate(self.layers): start_q = self.start_q[lid] end_q = self.end_q[lid] query_list = query_list_reserve.copy()[start_q:end_q] # prepare for parallel process output = paddle.concat(query_list, axis=0) fakesetsize = int(output.shape[0] / batchsize) reference_points_tiled = reference_points.tile([fakesetsize, 1, 1, 1]) memory_tiled = memory.tile([fakesetsize, 1, 1]) query_pos_embed_tiled = query_pos_embed.tile([fakesetsize, 1, 1]) memory_mask_tiled = memory_mask.tile([fakesetsize, 1]) output = layer(output, reference_points_tiled, memory_tiled, memory_spatial_shapes, memory_level_start_index, memory_mask_tiled, query_pos_embed_tiled) for i in range(fakesetsize): query_list_reserve.append(output[batchsize*i:batchsize*(i+1)]) if self.return_intermediate: for i in range(fakesetsize): intermediate.append(output[batchsize*i:batchsize*(i+1)]) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class QRDeformableTransformer(DeformableTransformer): def __init__(self, num_queries=300, position_embed_type='sine', return_intermediate_dec=True, in_feats_channel=[512, 1024, 2048], num_feature_levels=4, num_encoder_points=4, num_decoder_points=4, hidden_dim=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", lr_mult=0.1, pe_temperature=10000, pe_offset=-0.5, start_q=None, end_q=None): super(QRDeformableTransformer, self).__init__( num_queries=num_queries, position_embed_type=position_embed_type, return_intermediate_dec=return_intermediate_dec, in_feats_channel=in_feats_channel, num_feature_levels=num_feature_levels, num_encoder_points=num_encoder_points, num_decoder_points=num_decoder_points, hidden_dim=hidden_dim, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation, lr_mult=lr_mult, pe_temperature=pe_temperature, pe_offset=pe_offset) decoder_layer = DeformableTransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_feature_levels, 
num_decoder_points) self.decoder = QRDeformableTransformerDecoder( decoder_layer, num_decoder_layers, start_q, end_q, return_intermediate_dec) ================================================ FILE: ppdet/modeling/transformers/detr_transformer.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..layers import MultiHeadAttention, _convert_attention_mask from .position_encoding import PositionEmbedding from .utils import _get_clones from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_ __all__ = ['DETRTransformer'] class TransformerEncoderLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src class TransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward(self, src, src_mask=None, pos_embed=None): output = src for layer in self.layers: output = layer(output, src_mask=src_mask, 
pos_embed=pos_embed) if self.norm is not None: output = self.norm(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos_embed=None, query_pos_embed=None): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) residual = tgt if self.normalize_before: tgt = self.norm1(tgt) q = k = self.with_pos_embed(tgt, query_pos_embed) tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) q = self.with_pos_embed(tgt, query_pos_embed) k = self.with_pos_embed(memory, pos_embed) tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) tgt = residual + self.dropout2(tgt) if not self.normalize_before: tgt = self.norm2(tgt) residual = tgt if self.normalize_before: tgt = self.norm3(tgt) tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = residual + self.dropout3(tgt) if not self.normalize_before: tgt = self.norm3(tgt) return tgt class TransformerDecoder(nn.Layer): def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): super(TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos_embed=None, query_pos_embed=None): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) output = tgt intermediate = [] for layer in self.layers: output = layer( output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, pos_embed=pos_embed, query_pos_embed=query_pos_embed) if self.return_intermediate: intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class DETRTransformer(nn.Layer): __shared__ = ['hidden_dim'] def __init__(self, num_queries=100, position_embed_type='sine', return_intermediate_dec=True, backbone_num_channels=2048, hidden_dim=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", pe_temperature=10000, pe_offset=0., 
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type,
            offset=pe_offset)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }

    def _convert_attention_mask(self, mask):
        return (mask - 1.0) * 1e9

    def forward(self, src, src_mask=None, *args, **kwargs):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevent attention to unwanted positions, usually the
                paddings or the subsequent positions, with shape [bs, H, W].
                When the data type is bool, unwanted positions have `False`
                values and the rest have `True` values. When the data type is
                int, unwanted positions have 0 values and the rest have 1
                values. When the data type is float, unwanted positions have
                `-INF` values and the rest have 0 values. It can be None when
                no position needs to be masked out. Default None.
Returns: output (Tensor): [num_levels, batch_size, num_queries, hidden_dim] memory (Tensor): [batch_size, hidden_dim, h, w] """ # use last level feature map src_proj = self.input_proj(src[-1]) bs, c, h, w = src_proj.shape # flatten [B, C, H, W] to [B, HxW, C] src_flatten = src_proj.flatten(2).transpose([0, 2, 1]) if src_mask is not None: src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] else: src_mask = paddle.ones([bs, h, w]) pos_embed = self.position_embedding(src_mask).flatten(1, 2) if self.training: src_mask = self._convert_attention_mask(src_mask) src_mask = src_mask.reshape([bs, 1, 1, h * w]) else: src_mask = None memory = self.encoder( src_flatten, src_mask=src_mask, pos_embed=pos_embed) query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile( [bs, 1, 1]) tgt = paddle.zeros_like(query_pos_embed) output = self.decoder( tgt, memory, memory_mask=src_mask, pos_embed=pos_embed, query_pos_embed=query_pos_embed) if self.training: src_mask = src_mask.reshape([bs, 1, 1, h, w]) else: src_mask = None return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]), src_proj, src_mask) ================================================ FILE: ppdet/modeling/transformers/dino_transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
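The mask convention in the docstring above is easiest to see with a tiny example. Below is a standalone sketch (not repo code) of what `DETRTransformer._convert_attention_mask` does during training: a 0/1 mask becomes an additive float mask, so padded positions get roughly -1e9 added to their attention logits and receive near-zero attention weight after the softmax.

```
# Standalone illustration (not part of this repo) of the 0/1 mask ->
# additive float mask conversion used by DETRTransformer._convert_attention_mask.
import paddle
import paddle.nn.functional as F

mask = paddle.to_tensor([[1., 1., 0.]])  # last position is padding
additive = (mask - 1.0) * 1e9            # -> [[0., 0., -1e9]]

logits = paddle.randn([1, 3])
attn = F.softmax(logits + additive, axis=-1)
print(attn)  # the padded position gets ~0 attention weight
```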
================================================ FILE: ppdet/modeling/transformers/dino_transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from .position_encoding import PositionEmbedding from ..heads.detr_head import MLP from .deformable_transformer import (MSDeformableAttention, DeformableTransformerEncoderLayer, DeformableTransformerEncoder) from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_valid_ratio, get_contrastive_denoising_training_group, get_sine_pos_embed, inverse_sigmoid) __all__ = ['DINOTransformer'] class DINOTransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, lr_mult=1.0, weight_attr=None, bias_attr=None): super(DINOTransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # cross attention self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, lr_mult) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class DINOTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, weight_attr=None, bias_attr=None): super(DINOTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.norm = nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, query_pos_head, valid_ratios=None, attn_mask=None, memory_mask=None): if
valid_ratios is None: valid_ratios = paddle.ones( [memory.shape[0], memory_spatial_shapes.shape[0], 2]) output = tgt intermediate = [] inter_bboxes = [] ref_points = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): reference_points_input = ref_points.detach().unsqueeze( 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) query_pos_embed = get_sine_pos_embed( reference_points_input[..., 0, :], self.hidden_dim // 2) query_pos_embed = query_pos_head(query_pos_embed) output = layer(output, reference_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points.detach())) intermediate.append(self.norm(output)) inter_bboxes.append(ref_points) return paddle.stack(intermediate), paddle.stack(inter_bboxes) @register class DINOTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=900, position_embed_type='sine', in_feats_channel=[512, 1024, 2048], num_levels=4, num_encoder_points=4, num_decoder_points=4, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", lr_mult=1.0, pe_temperature=10000, pe_offset=-0.5, num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, eps=1e-2): super(DINOTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(in_feats_channel) <= num_levels self.hidden_dim = hidden_dim self.nhead = nhead self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers weight_attr = ParamAttr(regularizer=L2Decay(0.0)) bias_attr = ParamAttr(regularizer=L2Decay(0.0)) # backbone feature projection self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) # Transformer module encoder_layer = DeformableTransformerEncoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DINOTransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, weight_attr, bias_attr) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # position embedding self.position_embedding = PositionEmbedding( hidden_dim // 2, temperature=pe_temperature, normalize=True if position_embed_type == 'sine' else False, embed_type=position_embed_type, offset=pe_offset) self.level_embed = nn.Embedding(num_levels, hidden_dim) # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 
num_layers=3) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) normal_(self.level_embed.weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) constant_(l[0].bias) @classmethod def from_config(cls, cfg, input_shape): return {'in_feats_channel': [i.channels for i in input_shape], } def _build_input_proj_layer(self, in_feats_channel, weight_attr=None, bias_attr=None): self.input_proj = nn.LayerList() for in_channels in in_feats_channel: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1)), ( 'norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = in_feats_channel[-1] for _ in range(self.num_levels - len(in_feats_channel)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = self.hidden_dim def _get_encoder_input(self, feats, pad_mask=None): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] valid_ratios = [] for i, feat in enumerate(proj_feats): bs, _, h, w = paddle.shape(feat) spatial_shapes.append(paddle.stack([h, w])) # [b,c,h,w] -> [b,h*w,c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) if pad_mask is not None: mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] else: mask = paddle.ones([bs, h, w]) valid_ratios.append(get_valid_ratio(mask)) # [b, h*w, c] pos_embed = self.position_embedding(mask).flatten(1, 2) lvl_pos_embed = pos_embed + self.level_embed.weight[i] lvl_pos_embed_flatten.append(lvl_pos_embed) if pad_mask is not None: # [b, h*w] mask_flatten.append(mask.flatten(1)) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) # [b, l] mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, 1) # [b, l, c] lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # [num_levels, 2] spatial_shapes = paddle.to_tensor( paddle.stack(spatial_shapes).astype('int64')) # [l] start index of each level level_start_index = paddle.concat([ paddle.zeros( [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] ]) # [b, num_levels, 2] valid_ratios = paddle.stack(valid_ratios, 1) return 
(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) def forward(self, feats, pad_mask=None, gt_meta=None): # input projection and embedding (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) = self._get_encoder_input(feats, pad_mask) # encoder memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, mask_flatten, denoising_class, denoising_bbox_unact) # decoder inter_feats, inter_bboxes = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, mask_flatten) out_bboxes = [] out_logits = [] for i in range(self.num_decoder_layers): out_logits.append(self.dec_score_head[i](inter_feats[i])) if i == 0: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + init_ref_points_unact)) else: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + inverse_sigmoid(inter_bboxes[i - 1]))) out_bboxes = paddle.stack(out_bboxes) out_logits = paddle.stack(out_logits) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _get_encoder_output_anchors(self, memory, spatial_shapes, memory_mask=None, grid_size=0.05): output_anchors = [] idx = 0 for lvl, (h, w) in enumerate(spatial_shapes): if memory_mask is not None: mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) valid_H = paddle.sum(mask_[:, :, 0], 1) valid_W = paddle.sum(mask_[:, 0, :], 1) else: valid_H, valid_W = h, w grid_y, grid_x = paddle.meshgrid( paddle.arange(end=h), paddle.arange(end=w)) grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( [-1, 1, 1, 2]).astype(grid_xy.dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) output_anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) idx += h * w output_anchors = paddle.concat(output_anchors, 1) valid_mask = ((output_anchors > self.eps) * (output_anchors < 1 - self.eps)).all(-1, keepdim=True) output_anchors = paddle.log(output_anchors / (1 - output_anchors)) if memory_mask is not None: valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 output_anchors = paddle.where(valid_mask, output_anchors, paddle.to_tensor(float("inf"))) memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) return output_memory, output_anchors def _get_decoder_input(self, memory, spatial_shapes, memory_mask=None, denoising_class=None, denoising_bbox_unact=None): bs, _, _ = memory.shape # prepare input for decoder output_memory, output_anchors = self._get_encoder_output_anchors( memory, spatial_shapes, memory_mask) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head( output_memory) + output_anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, 
axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = paddle.gather_nd(output_memory, topk_ind).detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits

================================================ FILE: ppdet/modeling/transformers/ext_op/README.md ================================================

# Compiling the Multi-scale Deformable Attention Custom Op

This custom op is implemented following the PaddlePaddle guide on [custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).

## 1. Requirements

- Paddle >= 2.3.2
- gcc 8.2

## 2. Installation

Build and install the op from this directory:

```
cd PaddleDetection/ppdet/modeling/transformers/ext_op/
python setup_ms_deformable_attn_op.py install
```

After compilation the op can be used directly. A usage example for `ms_deformable_attn`:

```
import paddle

# import the custom op
from deformable_detr_ops import ms_deformable_attn

# construct fake input tensors
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])

def get_test_tensors(channels):
    value = paddle.rand(
        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(
        [bs, query_length, n_heads, n_levels, n_points, 2],
        dtype=paddle.float32)
    attention_weights = paddle.rand(
        [bs, query_length, n_heads, n_levels, n_points],
        dtype=paddle.float32) + 1e-5
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
        -2, keepdim=True)
    return [value, sampling_locations, attention_weights]

value, sampling_locations, attention_weights = get_test_tensors(c)

output = ms_deformable_attn(value, spatial_shapes, level_start_index,
                            sampling_locations, attention_weights)
```
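The op returns a single tensor whose last axis fuses heads and channels, i.e. `[bs, query_length, n_heads * channels]` (this is exactly what `MSDeformableAttnInferShape` in `ms_deformable_attn_op.cc` below declares). A quick sanity check for the example above:

```
# output fuses heads and channels: [bs, query_length, n_heads * c] -> [2, 2, 64]
assert output.shape == [bs, query_length, n_heads * c]
```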
## 3. Unit test

Run the unit test to verify that the custom op computes correct results:

```
python test_ms_deformable_attn_op.py
```

On success, it prints the following:

```
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
*tensor1 True check_gradient_numerical(D=30)
*tensor2 True check_gradient_numerical(D=30)
*tensor3 True check_gradient_numerical(D=30)
*tensor1 True check_gradient_numerical(D=32)
*tensor2 True check_gradient_numerical(D=32)
*tensor3 True check_gradient_numerical(D=32)
*tensor1 True check_gradient_numerical(D=64)
*tensor2 True check_gradient_numerical(D=64)
*tensor3 True check_gradient_numerical(D=64)
*tensor1 True check_gradient_numerical(D=71)
*tensor2 True check_gradient_numerical(D=71)
*tensor3 True check_gradient_numerical(D=71)
*tensor1 True check_gradient_numerical(D=128)
*tensor2 True check_gradient_numerical(D=128)
*tensor3 True check_gradient_numerical(D=128)
*tensor1 True check_gradient_numerical(D=1024)
*tensor2 True check_gradient_numerical(D=1024)
*tensor3 True check_gradient_numerical(D=1024)
*tensor1 True check_gradient_numerical(D=1025)
*tensor2 True check_gradient_numerical(D=1025)
*tensor3 True check_gradient_numerical(D=1025)
*tensor1 True check_gradient_numerical(D=2048)
*tensor2 True check_gradient_numerical(D=2048)
*tensor3 True check_gradient_numerical(D=2048)
*tensor1 True check_gradient_numerical(D=3096)
*tensor2 True check_gradient_numerical(D=3096)
*tensor3 True check_gradient_numerical(D=3096)
```
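The unit test (see `test_ms_deformable_attn_op.py` at the end of this section) compares the op against PaddleDetection's pure-Paddle reference, `deformable_attention_core_func` from `ppdet/modeling/transformers/utils.py`. As a rough sketch of what that reference computes — bilinear sampling per feature level via `grid_sample`, followed by an attention-weighted sum over levels and points — here is an illustrative reimplementation; it is not the repo's exact code, and the helper name `ms_deform_attn_ref` is invented for this example:

```
# Illustrative pure-Paddle multi-scale deformable attention (a sketch,
# NOT the repo's deformable_attention_core_func).
import paddle
import paddle.nn.functional as F

def ms_deform_attn_ref(value, spatial_shapes, sampling_locations,
                       attention_weights):
    # value:              [bs, sum(h*w), n_heads, c]
    # spatial_shapes:     [n_levels, 2] int64 (h, w) per level
    # sampling_locations: [bs, Lq, n_heads, n_levels, n_points, 2] in [0, 1]
    # attention_weights:  [bs, Lq, n_heads, n_levels, n_points]
    # returns:            [bs, Lq, n_heads * c]
    bs, _, n_heads, c = value.shape
    _, Lq, _, n_levels, n_points, _ = sampling_locations.shape
    split_sizes = [int((H * W).item()) for H, W in spatial_shapes]
    value_list = paddle.split(value, split_sizes, axis=1)
    # grid_sample expects sampling grids in [-1, 1]
    sampling_grids = 2 * sampling_locations - 1
    sampled = []
    for lvl, (H, W) in enumerate(spatial_shapes):
        h, w = int(H), int(W)
        # [bs, h*w, n_heads, c] -> [bs*n_heads, c, h, w]
        v = value_list[lvl].flatten(2).transpose([0, 2, 1]).reshape(
            [bs * n_heads, c, h, w])
        # [bs, Lq, n_heads, n_points, 2] -> [bs*n_heads, Lq, n_points, 2]
        g = sampling_grids[:, :, :, lvl].transpose([0, 2, 1, 3, 4]).flatten(
            0, 1)
        # bilinear sampling: [bs*n_heads, c, Lq, n_points]
        sampled.append(
            F.grid_sample(v, g, mode='bilinear', padding_mode='zeros',
                          align_corners=False))
    # [bs*n_heads, c, Lq, n_levels*n_points]
    sampled = paddle.stack(sampled, axis=-2).flatten(-2)
    w_ = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [bs * n_heads, 1, Lq, n_levels * n_points])
    out = (sampled * w_).sum(-1).reshape([bs, n_heads * c, Lq])
    return out.transpose([0, 2, 1])
```

Under the tolerances the test uses (`rtol=1e-2`, `atol=1e-3`), the CUDA op and a float32 reference of this form should agree closely.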
================================================ FILE: ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc ================================================

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/extension.h"
#include <vector>

// declare GPU implementation
std::vector<paddle::Tensor> MSDeformableAttnCUDAForward(
    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
    const paddle::Tensor &value_level_start_index,
    const paddle::Tensor &sampling_locations,
    const paddle::Tensor &attention_weights);

std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
    const paddle::Tensor &value_level_start_index,
    const paddle::Tensor &sampling_locations,
    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);

//// CPU not implemented

std::vector<std::vector<int64_t>> MSDeformableAttnInferShape(
    std::vector<int64_t> value_shape,
    std::vector<int64_t> value_spatial_shapes_shape,
    std::vector<int64_t> value_level_start_index_shape,
    std::vector<int64_t> sampling_locations_shape,
    std::vector<int64_t> attention_weights_shape) {
  return {{value_shape[0], sampling_locations_shape[1],
           value_shape[2] * value_shape[3]}};
}

std::vector<paddle::DataType> MSDeformableAttnInferDtype(
    paddle::DataType value_dtype, paddle::DataType value_spatial_shapes_dtype,
    paddle::DataType value_level_start_index_dtype,
    paddle::DataType sampling_locations_dtype,
    paddle::DataType attention_weights_dtype) {
  return {value_dtype};
}

PD_BUILD_OP(ms_deformable_attn)
    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
             "AttentionWeights"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));

PD_BUILD_GRAD_OP(ms_deformable_attn)
    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
             "AttentionWeights", paddle::Grad("Out")})
    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
              paddle::Grad("AttentionWeights")})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
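One thing worth noting before the CUDA implementation below: although `PD_BUILD_GRAD_OP` declares gradient outputs for all five inputs, only the three floating-point inputs receive real gradients; `MSDeformableAttnCUDABackward` allocates `grad_spatial_shapes` and `grad_level_start_index` as zero-filled placeholders and never writes to them. A hypothetical check from Python (assuming the op was built as in the README above; names mirror the README example):

```
# Hypothetical sanity check (assumes deformable_detr_ops was built per the README).
import paddle
from deformable_detr_ops import ms_deformable_attn

bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])

value = paddle.rand([bs, value_length, n_heads, c])
sampling_locations = paddle.rand(
    [bs, query_length, n_heads, n_levels, n_points, 2])
attention_weights = paddle.rand(
    [bs, query_length, n_heads, n_levels, n_points])
for t in (value, sampling_locations, attention_weights):
    t.stop_gradient = False

out = ms_deformable_attn(value, spatial_shapes, level_start_index,
                         sampling_locations, attention_weights)
out.sum().backward()
# gradients flow only to the three float inputs
print(value.grad.shape, sampling_locations.grad.shape,
      attention_weights.grad.shape)
```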
*/ #include "paddle/extension.h" #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } // forward bilinear template __device__ data_t deformable_attn_bilinear_forward( const data_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const data_t &h, const data_t &w, const int &m, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const data_t lh = h - h_low; const data_t lw = w - w_low; const data_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; data_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } data_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } data_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } data_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } // forward kernel template __global__ void deformable_attn_cuda_kernel_forward( const int n, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *output_data_ptr) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; data_t *data_ptr = output_data_ptr + index; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; data_t col = 0; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && 
h_im < spatial_h && w_im < spatial_w) { col += deformable_attn_bilinear_forward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_ptr = col; } } #define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") // forward std::vector MSDeformableAttnCUDAForward(const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, const paddle::Tensor &value_level_start_index, const paddle::Tensor &sampling_locations, const paddle::Tensor &attention_weights) { CHECK_INPUT_GPU(value); CHECK_INPUT_GPU(value_spatial_shapes); CHECK_INPUT_GPU(value_level_start_index); CHECK_INPUT_GPU(sampling_locations); CHECK_INPUT_GPU(attention_weights); const int batch_size = value.shape()[0]; const int value_length = value.shape()[1]; const int num_heads = value.shape()[2]; const int channels = value.shape()[3]; const int num_levels = value_spatial_shapes.shape()[0]; const int query_length = sampling_locations.shape()[1]; const int num_points = sampling_locations.shape()[4]; auto output = paddle::full({batch_size, query_length, num_heads * channels}, 0, value.dtype(), paddle::GPUPlace()); const int num_kernels = batch_size * query_length * num_heads * channels; deformable_attn_cuda_kernel_forward <<>>(num_kernels, value.data(), value_spatial_shapes.data(), value_level_start_index.data(), sampling_locations.data(), attention_weights.data(), batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, output.data()); return {output}; } // backward bilinear template __device__ void deformable_attn_bilinear_backward( const data_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const data_t &h, const data_t &w, const int &m, const int &c, const data_t &top_grad, const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const data_t lh = h - h_low; const data_t lw = w - w_low; const data_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const data_t top_grad_value = top_grad * attn_weight; data_t grad_h_weight = 0, grad_w_weight = 0; data_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } data_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } data_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } data_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = 
bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void deformable_attn_bilinear_backward_gm( const data_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const data_t &h, const data_t &w, const int &m, const int &c, const data_t &top_grad, const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const data_t lh = h - h_low; const data_t lw = w - w_low; const data_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const data_t top_grad_value = top_grad * attn_weight; data_t grad_h_weight = 0, grad_w_weight = 0; data_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } data_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } data_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } data_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } // backward kernels // channels > 1024 template __global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; data_t *cache_grad_sampling_loc = (data_t *)_s; data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = 
_temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_gm( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % 
num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } // channels <= 1024 template __global__ void deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; __shared__ data_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; 
const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { data_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; __shared__ data_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; 
const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; data_t *cache_grad_sampling_loc = (data_t *)_s; data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; 
*(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { data_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; data_t *cache_grad_sampling_loc = (data_t *)_s; data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( 
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } // backward branch template void deformable_attn_cuda_backward( cudaStream_t stream, const data_t *grad_out, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { const int num_threads = (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels; const int num_kernels = batch_size * query_length * num_heads * channels; const int num_actual_kernels = batch_size * query_length * num_heads * channels; if (channels > 1024) { if ((channels & 1023) == 0) { deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } else { deformable_attn_cuda_kernel_backward_gm <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } } else { switch (channels) { case 1: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 2: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 4: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, 
value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 8: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 16: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 32: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 64: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 128: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 256: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 512: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 1024: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; default: if (channels < 64) { deformable_attn_cuda_kernel_backward_shm_reduce_v1 <<>>( num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } else { deformable_attn_cuda_kernel_backward_shm_reduce_v2 <<>>( num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } } } } // backward std::vector 
MSDeformableAttnCUDABackward(const paddle::Tensor &value,
                             const paddle::Tensor &value_spatial_shapes,
                             const paddle::Tensor &value_level_start_index,
                             const paddle::Tensor &sampling_locations,
                             const paddle::Tensor &attention_weights,
                             const paddle::Tensor &grad_out) {
  CHECK_INPUT_GPU(value);
  CHECK_INPUT_GPU(value_spatial_shapes);
  CHECK_INPUT_GPU(value_level_start_index);
  CHECK_INPUT_GPU(sampling_locations);
  CHECK_INPUT_GPU(attention_weights);
  CHECK_INPUT_GPU(grad_out);

  const int batch_size = value.shape()[0];
  const int value_length = value.shape()[1];
  const int num_heads = value.shape()[2];
  const int channels = value.shape()[3];

  const int num_levels = value_spatial_shapes.shape()[0];
  const int query_length = sampling_locations.shape()[1];
  const int num_points = sampling_locations.shape()[4];

  auto grad_value =
      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
  auto grad_spatial_shapes =
      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
  auto grad_level_start_index =
      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
  auto grad_sampling_locations =
      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),
                   paddle::GPUPlace());
  auto grad_attention_weights =
      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),
                   paddle::GPUPlace());

  PD_DISPATCH_FLOATING_TYPES(
      value.type(), "deformable_attn_cuda_backward", ([&] {
        deformable_attn_cuda_backward<data_t>(
            value.stream(), grad_out.data<data_t>(), value.data<data_t>(),
            value_spatial_shapes.data<int64_t>(),
            value_level_start_index.data<int64_t>(),
            sampling_locations.data<data_t>(),
            attention_weights.data<data_t>(), batch_size, value_length,
            num_heads, channels, num_levels, query_length, num_points,
            grad_value.data<data_t>(), grad_sampling_locations.data<data_t>(),
            grad_attention_weights.data<data_t>());
      }));

  return {grad_value, grad_spatial_shapes, grad_level_start_index,
          grad_sampling_locations, grad_attention_weights};
}

================================================
FILE: ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
================================================
from paddle.utils.cpp_extension import CUDAExtension, setup

if __name__ == "__main__":
    setup(
        name='deformable_detr_ops',
        ext_modules=CUDAExtension(
            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))

================================================
FILE: ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
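# NOTE: this script checks the compiled `ms_deformable_attn` custom op against
# the pure-Paddle reference `deformable_attention_core_func`: forward outputs
# are compared with `paddle.allclose`, and the gradients of the value,
# sampling-location and attention-weight tensors are compared after calling
# `.backward()` on both paths. A typical invocation (the trailing GPU index is
# optional and defaults to 0):
#
#     python setup_ms_deformable_attn_op.py install
#     python test_ms_deformable_attn_op.py 0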
from __future__ import absolute_import from __future__ import print_function from __future__ import division import os import sys import random import numpy as np import paddle # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.modeling.transformers.utils import deformable_attention_core_func ms_deform_attn_core_paddle = deformable_attention_core_func try: gpu_index = int(sys.argv[1]) except: gpu_index = 0 print(f'Use gpu {gpu_index} to test...') paddle.set_device(f'gpu:{gpu_index}') try: from deformable_detr_ops import ms_deformable_attn except Exception as e: print('import deformable_detr_ops error', e) sys.exit(-1) paddle.seed(1) random.seed(1) np.random.seed(1) bs, n_heads, c = 2, 8, 8 query_length, n_levels, n_points = 2, 2, 2 spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) level_start_index = paddle.concat((paddle.to_tensor( [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) value_length = sum([(H * W).item() for H, W in spatial_shapes]) def get_test_tensors(channels): value = paddle.rand( [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 sampling_locations = paddle.rand( [bs, query_length, n_heads, n_levels, n_points, 2], dtype=paddle.float32) attention_weights = paddle.rand( [bs, query_length, n_heads, n_levels, n_points], dtype=paddle.float32) + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum( -2, keepdim=True) return [value, sampling_locations, attention_weights] @paddle.no_grad() def check_forward_equal_with_paddle_float(): value, sampling_locations, attention_weights = get_test_tensors(c) output_paddle = ms_deform_attn_core_paddle( value, spatial_shapes, level_start_index, sampling_locations, attention_weights).detach().cpu() output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, sampling_locations, attention_weights).detach().cpu() fwdok = paddle.allclose( output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() max_abs_err = (output_cuda - output_paddle).abs().max().item() max_rel_err = ( (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() print( f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' ) def check_gradient_numerical(channels=4): value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( channels) value_paddle.stop_gradient = False sampling_locations_paddle.stop_gradient = False attention_weights_paddle.stop_gradient = False value_cuda = value_paddle.detach().clone() sampling_locations_cuda = sampling_locations_paddle.detach().clone() attention_weights_cuda = attention_weights_paddle.detach().clone() value_cuda.stop_gradient = False sampling_locations_cuda.stop_gradient = False attention_weights_cuda.stop_gradient = False output_paddle = ms_deform_attn_core_paddle( value_paddle, spatial_shapes, level_start_index, sampling_locations_paddle, attention_weights_paddle) output_paddle.sum().backward() output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, level_start_index, sampling_locations_cuda, attention_weights_cuda) output_cuda.sum().backward() res = paddle.allclose( value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f'*tensor1 {res} check_gradient_numerical(D={channels})') res = paddle.allclose( sampling_locations_paddle.grad, sampling_locations_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f'*tensor2 
{res} check_gradient_numerical(D={channels})') res = paddle.allclose( attention_weights_paddle.grad, attention_weights_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f'*tensor3 {res} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_paddle_float() for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: check_gradient_numerical(channels) ================================================ FILE: ppdet/modeling/transformers/group_detr_transformer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from .position_encoding import PositionEmbedding from ..heads.detr_head import MLP from .deformable_transformer import MSDeformableAttention from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_valid_ratio, get_contrastive_denoising_training_group, get_sine_pos_embed, inverse_sigmoid) __all__ = ['GroupDINOTransformer'] class DINOTransformerEncoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, weight_attr=None, bias_attr=None): super(DINOTransformerEncoderLayer, self).__init__() # self attention self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, reference_points, spatial_shapes, 
level_start_index, src_mask=None, query_pos_embed=None): # self attention src2 = self.self_attn( self.with_pos_embed(src, query_pos_embed), reference_points, src, spatial_shapes, level_start_index, src_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class DINOTransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers): super(DINOTransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): valid_ratios = valid_ratios.unsqueeze(1) reference_points = [] for i, (H, W) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * H) ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * W) reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) reference_points = paddle.concat(reference_points, 1).unsqueeze(2) reference_points = reference_points * valid_ratios return reference_points def forward(self, feat, spatial_shapes, level_start_index, feat_mask=None, query_pos_embed=None, valid_ratios=None): if valid_ratios is None: valid_ratios = paddle.ones( [feat.shape[0], spatial_shapes.shape[0], 2]) reference_points = self.get_reference_points(spatial_shapes, valid_ratios) for layer in self.layers: feat = layer(feat, reference_points, spatial_shapes, level_start_index, feat_mask, query_pos_embed) return feat class DINOTransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, dual_queries=False, dual_groups=0, weight_attr=None, bias_attr=None): super(DINOTransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # for dual groups self.dual_queries = dual_queries self.dual_groups = dual_groups self.n_head = n_head self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if self.dual_queries: dual_groups = self.dual_groups bs, num_queries, 
n_model = q.shape q = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0) k = paddle.concat(k.split(dual_groups + 1, axis=1), axis=0) tgt = paddle.concat(tgt.split(dual_groups + 1, axis=1), axis=0) g_num_queries = num_queries // (dual_groups + 1) if attn_mask is None or attn_mask[0] is None: attn_mask = None else: # [(dual_groups + 1), g_num_queries, g_num_queries] attn_mask = paddle.concat( [sa_mask.unsqueeze(0) for sa_mask in attn_mask], axis=0) # [1, (dual_groups + 1), 1, g_num_queries, g_num_queries] # --> [bs, (dual_groups + 1), nhead, g_num_queries, g_num_queries] # --> [bs * (dual_groups + 1), nhead, g_num_queries, g_num_queries] attn_mask = attn_mask.unsqueeze(0).unsqueeze(2).tile( [bs, 1, self.n_head, 1, 1]) attn_mask = attn_mask.reshape([ bs * (dual_groups + 1), self.n_head, g_num_queries, g_num_queries ]) if attn_mask is not None: attn_mask = attn_mask.astype('bool') tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm2(tgt) # trace back if self.dual_queries: tgt = paddle.concat(tgt.split(dual_groups + 1, axis=0), axis=1) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm1(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class DINOTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, return_intermediate=True): super(DINOTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.return_intermediate = return_intermediate self.norm = nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, query_pos_head, valid_ratios=None, attn_mask=None, memory_mask=None): if valid_ratios is None: valid_ratios = paddle.ones( [memory.shape[0], memory_spatial_shapes.shape[0], 2]) output = tgt intermediate = [] inter_ref_bboxes = [] for i, layer in enumerate(self.layers): reference_points_input = reference_points.unsqueeze( 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) query_pos_embed = get_sine_pos_embed( reference_points_input[..., 0, :], self.hidden_dim // 2) query_pos_embed = query_pos_head(query_pos_embed) output = layer(output, reference_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( reference_points)) if self.return_intermediate: intermediate.append(self.norm(output)) inter_ref_bboxes.append(inter_ref_bbox) reference_points = inter_ref_bbox.detach() if self.return_intermediate: return paddle.stack(intermediate), paddle.stack(inter_ref_bboxes) return output, reference_points @register class GroupDINOTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=900, position_embed_type='sine', return_intermediate_dec=True, backbone_feat_channels=[512, 1024, 2048], num_levels=4, num_encoder_points=4, num_decoder_points=4, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", pe_temperature=10000, pe_offset=-0.5, num_denoising=100, label_noise_ratio=0.5, 
box_noise_scale=1.0, learnt_init_query=True, use_input_proj=True, dual_queries=False, dual_groups=0, eps=1e-2): super(GroupDINOTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels self.hidden_dim = hidden_dim self.nhead = nhead self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.use_input_proj = use_input_proj if use_input_proj: # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module encoder_layer = DINOTransformerEncoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_encoder_points) self.encoder = DINOTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DINOTransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points, dual_queries=dual_queries, dual_groups=dual_groups) self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, return_intermediate_dec) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # for dual group self.dual_queries = dual_queries self.dual_groups = dual_groups if self.dual_queries: self.denoising_class_embed_groups = nn.LayerList([ nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) for _ in range(self.dual_groups) ]) # position embedding self.position_embedding = PositionEmbedding( hidden_dim // 2, temperature=pe_temperature, normalize=True if position_embed_type == 'sine' else False, embed_type=position_embed_type, offset=pe_offset) self.level_embed = nn.Embedding(num_levels, hidden_dim) # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) normal_(self.tgt_embed.weight) if self.dual_queries: self.tgt_embed_dual = nn.LayerList([ nn.Embedding(num_queries, hidden_dim) for _ in range(self.dual_groups) ]) for dual_tgt_module in self.tgt_embed_dual: normal_(dual_tgt_module.weight) self.query_pos_head = MLP(2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) if self.dual_queries: self.enc_output = _get_clones(self.enc_output, self.dual_groups + 1) else: self.enc_output = _get_clones(self.enc_output, 1) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) if self.dual_queries: self.enc_bbox_head_dq = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for i in range(self.dual_groups) ]) self.enc_score_head_dq = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for i in range(self.dual_groups) ]) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = 
bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) for enc_output in self.enc_output: linear_init_(enc_output[0]) xavier_uniform_(enc_output[0].weight) normal_(self.level_embed.weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) normal_(self.denoising_class_embed.weight) if self.use_input_proj: for l in self.input_proj: xavier_uniform_(l[0].weight) constant_(l[0].bias) @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape], } def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats, pad_mask=None): if self.use_input_proj: # get projection features proj_feats = [ self.input_proj[i](feat) for i, feat in enumerate(feats) ] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) else: proj_feats = feats # get encoder inputs feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] valid_ratios = [] for i, feat in enumerate(proj_feats): bs, _, h, w = feat.shape spatial_shapes.append(paddle.concat([h, w])) # [b,c,h,w] -> [b,h*w,c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) if pad_mask is not None: mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] else: mask = paddle.ones([bs, h, w]) valid_ratios.append(get_valid_ratio(mask)) # [b, h*w, c] pos_embed = self.position_embedding(mask).flatten(1, 2) lvl_pos_embed = pos_embed + self.level_embed.weight[i].reshape( [1, 1, -1]) lvl_pos_embed_flatten.append(lvl_pos_embed) if pad_mask is not None: # [b, h*w] mask_flatten.append(mask.flatten(1)) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) # [b, l] mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, 1) # [b, l, c] lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # [num_levels, 2] spatial_shapes = paddle.to_tensor( paddle.stack(spatial_shapes).astype('int64')) # [l] start index of each level level_start_index = paddle.concat([ paddle.zeros( [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] ]) # [b, num_levels, 2] valid_ratios = paddle.stack(valid_ratios, 1) return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, 
valid_ratios) def forward(self, feats, pad_mask=None, gt_meta=None): # input projection and embedding (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) = self._get_encoder_input(feats, pad_mask) # encoder memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) # prepare denoising training if self.training: denoising_class, denoising_bbox, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) if self.dual_queries: denoising_class_groups = [] denoising_bbox_groups = [] attn_mask_groups = [] dn_meta_groups = [] for g_id in range(self.dual_groups): denoising_class_gid, denoising_bbox_gid, attn_mask_gid, dn_meta_gid = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed_groups[g_id].weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) denoising_class_groups.append(denoising_class_gid) denoising_bbox_groups.append(denoising_bbox_gid) attn_mask_groups.append(attn_mask_gid) dn_meta_groups.append(dn_meta_gid) # combine denoising_class = [denoising_class] + denoising_class_groups denoising_bbox = [denoising_bbox] + denoising_bbox_groups attn_mask = [attn_mask] + attn_mask_groups dn_meta = [dn_meta] + dn_meta_groups else: denoising_class, denoising_bbox, attn_mask, dn_meta = None, None, None, None target, init_ref_points, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, mask_flatten, denoising_class, denoising_bbox) # decoder inter_feats, inter_ref_bboxes = self.decoder( target, init_ref_points, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, mask_flatten) # solve hang during distributed training inter_feats[0] += self.denoising_class_embed.weight[0, 0] * 0. 
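        # NOTE: `weight[0, 0] * 0.` is a numerical no-op, but it pulls the
        # denoising embedding into the autograd graph even when no denoising
        # query contributed to the output, so every parameter receives a
        # gradient and the distributed all-reduce cannot hang on unused
        # parameters.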
if self.dual_queries: for g_id in range(self.dual_groups): inter_feats[0] += self.denoising_class_embed_groups[ g_id].weight[0, 0] * 0.0 out_bboxes = [] out_logits = [] for i in range(self.num_decoder_layers): out_logits.append(self.dec_score_head[i](inter_feats[i])) if i == 0: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + inverse_sigmoid(init_ref_points))) else: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + inverse_sigmoid(inter_ref_bboxes[i - 1]))) out_bboxes = paddle.stack(out_bboxes) out_logits = paddle.stack(out_logits) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _get_encoder_output_anchors(self, memory, spatial_shapes, memory_mask=None, grid_size=0.05): output_anchors = [] idx = 0 for lvl, (h, w) in enumerate(spatial_shapes): if memory_mask is not None: mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) valid_H = paddle.sum(mask_[:, :, 0], 1) valid_W = paddle.sum(mask_[:, 0, :], 1) else: valid_H, valid_W = h, w grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=memory.dtype), paddle.arange( end=w, dtype=memory.dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( [-1, 1, 1, 2]).astype(grid_xy.dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) output_anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) idx += h * w output_anchors = paddle.concat(output_anchors, 1) valid_mask = ((output_anchors > self.eps) * (output_anchors < 1 - self.eps)).all(-1, keepdim=True) output_anchors = paddle.log(output_anchors / (1 - output_anchors)) if memory_mask is not None: valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 output_anchors = paddle.where(valid_mask, output_anchors, paddle.to_tensor(float("inf"))) memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) if self.dual_queries: output_memory = [ self.enc_output[g_id](memory) for g_id in range(self.dual_groups + 1) ] else: output_memory = self.enc_output[0](memory) return output_memory, output_anchors def _get_decoder_input(self, memory, spatial_shapes, memory_mask=None, denoising_class=None, denoising_bbox=None): bs, _, _ = memory.shape # prepare input for decoder output_memory, output_anchors = self._get_encoder_output_anchors( memory, spatial_shapes, memory_mask) if self.dual_queries: enc_outputs_class = self.enc_score_head(output_memory[0]) enc_outputs_coord_unact = self.enc_bbox_head(output_memory[ 0]) + output_anchors else: enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head( output_memory) + output_anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) topk_coords_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. 
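        # `topk_ind` now stacks (batch index, query index) into
        # [bs, num_queries, 2], so `gather_nd` picks per-image top-k rows:
        # `enc_outputs_coord_unact` is [bs, l, 4] and the gathered
        # `topk_coords_unact` is [bs, num_queries, 4], still in logit space.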
enc_topk_bboxes = F.sigmoid(topk_coords_unact) reference_points = enc_topk_bboxes.detach() enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) if self.dual_queries: enc_topk_logits_groups = [] enc_topk_bboxes_groups = [] reference_points_groups = [] topk_ind_groups = [] for g_id in range(self.dual_groups): enc_outputs_class_gid = self.enc_score_head_dq[g_id]( output_memory[g_id + 1]) enc_outputs_coord_unact_gid = self.enc_bbox_head_dq[g_id]( output_memory[g_id + 1]) + output_anchors _, topk_ind_gid = paddle.topk( enc_outputs_class_gid.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind_gid.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind_gid = paddle.stack([batch_ind, topk_ind_gid], axis=-1) topk_coords_unact_gid = paddle.gather_nd( enc_outputs_coord_unact_gid, topk_ind_gid) # unsigmoided. enc_topk_bboxes_gid = F.sigmoid(topk_coords_unact_gid) reference_points_gid = enc_topk_bboxes_gid.detach() enc_topk_logits_gid = paddle.gather_nd(enc_outputs_class_gid, topk_ind_gid) # append and combine topk_ind_groups.append(topk_ind_gid) enc_topk_logits_groups.append(enc_topk_logits_gid) enc_topk_bboxes_groups.append(enc_topk_bboxes_gid) reference_points_groups.append(reference_points_gid) enc_topk_bboxes = paddle.concat( [enc_topk_bboxes] + enc_topk_bboxes_groups, 1) enc_topk_logits = paddle.concat( [enc_topk_logits] + enc_topk_logits_groups, 1) reference_points = paddle.concat( [reference_points] + reference_points_groups, 1) topk_ind = paddle.concat([topk_ind] + topk_ind_groups, 1) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) if self.dual_queries: target = paddle.concat([target] + [ self.tgt_embed_dual[g_id].weight.unsqueeze(0).tile( [bs, 1, 1]) for g_id in range(self.dual_groups) ], 1) else: if self.dual_queries: target = paddle.gather_nd(output_memory[0], topk_ind) target_groups = [] for g_id in range(self.dual_groups): target_gid = paddle.gather_nd(output_memory[g_id + 1], topk_ind_groups[g_id]) target_groups.append(target_gid) target = paddle.concat([target] + target_groups, 1).detach() else: target = paddle.gather_nd(output_memory, topk_ind).detach() if denoising_bbox is not None: if isinstance(denoising_bbox, list) and isinstance( denoising_class, list) and self.dual_queries: if denoising_bbox[0] is not None: reference_points_list = paddle.split( reference_points, self.dual_groups + 1, axis=1) reference_points = paddle.concat( [ paddle.concat( [ref, ref_], axis=1) for ref, ref_ in zip(denoising_bbox, reference_points_list) ], axis=1) target_list = paddle.split( target, self.dual_groups + 1, axis=1) target = paddle.concat( [ paddle.concat( [tgt, tgt_], axis=1) for tgt, tgt_ in zip(denoising_class, target_list) ], axis=1) else: reference_points, target = reference_points, target else: reference_points = paddle.concat( [denoising_bbox, reference_points], 1) target = paddle.concat([denoising_class, target], 1) return target, reference_points, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/hybrid_encoder.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.ops import get_act_fn from ..shape_spec import ShapeSpec from ..backbones.csp_darknet import BaseConv from ..backbones.cspresnet import RepVggBlock from ppdet.modeling.transformers.detr_transformer import TransformerEncoder from ..initializer import xavier_uniform_, linear_init_ from ..layers import MultiHeadAttention from paddle import ParamAttr from paddle.regularizer import L2Decay __all__ = ['HybridEncoder', 'MaskHybridEncoder'] class CSPRepLayer(nn.Layer): def __init__(self, in_channels, out_channels, num_blocks=3, expansion=1.0, bias=False, act="silu"): super(CSPRepLayer, self).__init__() hidden_channels = int(out_channels * expansion) self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.conv2 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.bottlenecks = nn.Sequential(* [ RepVggBlock( hidden_channels, hidden_channels, act=act) for _ in range(num_blocks) ]) if hidden_channels != out_channels: self.conv3 = BaseConv( hidden_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) else: self.conv3 = nn.Identity() def forward(self, x): x_1 = self.conv1(x) x_1 = self.bottlenecks(x_1) x_2 = self.conv2(x) return self.conv3(x_1 + x_2) @register class TransformerLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=1024, dropout=0., activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src @register @serializable class HybridEncoder(nn.Layer): __shared__ = 
['depth_mult', 'act', 'trt', 'eval_size'] __inject__ = ['encoder_layer'] def __init__(self, in_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], hidden_dim=256, use_encoder_idx=[2], num_encoder_layers=1, encoder_layer='TransformerLayer', pe_temperature=10000, expansion=1.0, depth_mult=1.0, act='silu', trt=False, eval_size=None): super(HybridEncoder, self).__init__() self.in_channels = in_channels self.feat_strides = feat_strides self.hidden_dim = hidden_dim self.use_encoder_idx = use_encoder_idx self.num_encoder_layers = num_encoder_layers self.pe_temperature = pe_temperature self.eval_size = eval_size # channel projection self.input_proj = nn.LayerList() for in_channel in in_channels: self.input_proj.append( nn.Sequential( nn.Conv2D( in_channel, hidden_dim, kernel_size=1, bias_attr=False), nn.BatchNorm2D( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))))) # encoder transformer self.encoder = nn.LayerList([ TransformerEncoder(encoder_layer, num_encoder_layers) for _ in range(len(use_encoder_idx)) ]) act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act # top-down fpn self.lateral_convs = nn.LayerList() self.fpn_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.lateral_convs.append( BaseConv( hidden_dim, hidden_dim, 1, 1, act=act)) self.fpn_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)) # bottom-up pan self.downsample_convs = nn.LayerList() self.pan_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsample_convs.append( BaseConv( hidden_dim, hidden_dim, 3, stride=2, act=act)) self.pan_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)) self._reset_parameters() def _reset_parameters(self): if self.eval_size: for idx in self.use_encoder_idx: stride = self.feat_strides[idx] pos_embed = self.build_2d_sincos_position_embedding( self.eval_size[1] // stride, self.eval_size[0] // stride, self.hidden_dim, self.pe_temperature) setattr(self, f'pos_embed{idx}', pos_embed) @staticmethod def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): grid_w = paddle.arange(int(w), dtype=paddle.float32) grid_h = paddle.arange(int(h), dtype=paddle.float32) grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) assert embed_dim % 4 == 0, \ 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = 1. 
/ (temperature**omega) out_w = grid_w.flatten()[..., None] @omega[None] out_h = grid_h.flatten()[..., None] @omega[None] return paddle.concat( [ paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), paddle.cos(out_h) ], axis=1)[None, :, :] def forward(self, feats, for_mot=False, is_teacher=False): assert len(feats) == len(self.in_channels) # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] # encoder if self.num_encoder_layers > 0: for i, enc_ind in enumerate(self.use_encoder_idx): h, w = proj_feats[enc_ind].shape[2:] # flatten [B, C, H, W] to [B, HxW, C] src_flatten = proj_feats[enc_ind].flatten(2).transpose( [0, 2, 1]) if self.training or self.eval_size is None or is_teacher: pos_embed = self.build_2d_sincos_position_embedding( w, h, self.hidden_dim, self.pe_temperature) else: pos_embed = getattr(self, f'pos_embed{enc_ind}', None) memory = self.encoder[i](src_flatten, pos_embed=pos_embed) proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape( [-1, self.hidden_dim, h, w]) # top-down fpn inner_outs = [proj_feats[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = proj_feats[idx - 1] feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( feat_heigh) inner_outs[0] = feat_heigh upsample_feat = F.interpolate( feat_heigh, scale_factor=2., mode="nearest") inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( paddle.concat( [upsample_feat, feat_low], axis=1)) inner_outs.insert(0, inner_out) # bottom-up pan outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsample_convs[idx](feat_low) out = self.pan_blocks[idx](paddle.concat( [downsample_feat, feat_height], axis=1)) outs.append(out) return outs @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'feat_strides': [i.stride for i in input_shape] } @property def out_shape(self): return [ ShapeSpec( channels=self.hidden_dim, stride=self.feat_strides[idx]) for idx in range(len(self.in_channels)) ] class MaskFeatFPN(nn.Layer): def __init__(self, in_channels=[256, 256, 256], fpn_strides=[32, 16, 8], feat_channels=256, dropout_ratio=0.0, out_channels=256, align_corners=False, act='swish'): super(MaskFeatFPN, self).__init__() assert len(in_channels) == len(fpn_strides) reorder_index = np.argsort(fpn_strides, axis=0) in_channels = [in_channels[i] for i in reorder_index] fpn_strides = [fpn_strides[i] for i in reorder_index] assert min(fpn_strides) == fpn_strides[0] self.reorder_index = reorder_index self.fpn_strides = fpn_strides self.dropout_ratio = dropout_ratio self.align_corners = align_corners if self.dropout_ratio > 0: self.dropout = nn.Dropout2D(dropout_ratio) self.scale_heads = nn.LayerList() for i in range(len(fpn_strides)): head_length = max( 1, int(np.log2(fpn_strides[i]) - np.log2(fpn_strides[0]))) scale_head = [] for k in range(head_length): in_c = in_channels[i] if k == 0 else feat_channels scale_head.append( nn.Sequential( BaseConv(in_c, feat_channels, 3, 1, act=act)) ) if fpn_strides[i] != fpn_strides[0]: scale_head.append( nn.Upsample( scale_factor=2, mode='bilinear', align_corners=align_corners)) self.scale_heads.append(nn.Sequential(*scale_head)) self.output_conv = BaseConv( feat_channels, out_channels, 3, 1, act=act) def forward(self, inputs): x = [inputs[i] for i in self.reorder_index] output = self.scale_heads[0](x[0]) for i in range(1, len(self.fpn_strides)): output = 
output + F.interpolate( self.scale_heads[i](x[i]), size=output.shape[2:], mode='bilinear', align_corners=self.align_corners) if self.dropout_ratio > 0: output = self.dropout(output) output = self.output_conv(output) return output @register @serializable class MaskHybridEncoder(HybridEncoder): __shared__ = ['depth_mult', 'act', 'trt', 'eval_size', 'num_prototypes'] __inject__ = ['encoder_layer'] def __init__(self, in_channels=[256, 512, 1024, 2048], feat_strides=[4, 8, 16, 32], hidden_dim=256, use_encoder_idx=[3], num_encoder_layers=1, encoder_layer='TransformerLayer', num_prototypes=32, pe_temperature=10000, expansion=1.0, depth_mult=1.0, mask_feat_channels=[64, 64], act='silu', trt=False, eval_size=None): assert len(in_channels) == len(feat_strides) x4_feat_dim = in_channels.pop(0) x4_feat_stride = feat_strides.pop(0) use_encoder_idx = [i - 1 for i in use_encoder_idx] assert x4_feat_stride == 4 super(MaskHybridEncoder, self).__init__( in_channels=in_channels, feat_strides=feat_strides, hidden_dim=hidden_dim, use_encoder_idx=use_encoder_idx, num_encoder_layers=num_encoder_layers, encoder_layer=encoder_layer, pe_temperature=pe_temperature, expansion=expansion, depth_mult=depth_mult, act=act, trt=trt, eval_size=eval_size) self.mask_feat_head = MaskFeatFPN( [hidden_dim] * len(feat_strides), feat_strides, feat_channels=mask_feat_channels[0], out_channels=mask_feat_channels[1], act=act) self.enc_mask_lateral = BaseConv( x4_feat_dim, mask_feat_channels[1], 3, 1, act=act) self.enc_mask_output = nn.Sequential( BaseConv( mask_feat_channels[1], mask_feat_channels[1], 3, 1, act=act), nn.Conv2D(mask_feat_channels[1], num_prototypes, 1)) def forward(self, feats, for_mot=False, is_teacher=False): x4_feat = feats.pop(0) enc_feats = super(MaskHybridEncoder, self).forward( feats, for_mot=for_mot, is_teacher=is_teacher) mask_feat = self.mask_feat_head(enc_feats) mask_feat = F.interpolate( mask_feat, scale_factor=2, mode='bilinear', align_corners=False) mask_feat += self.enc_mask_lateral(x4_feat) mask_feat = self.enc_mask_output(mask_feat) return enc_feats, mask_feat ================================================ FILE: ppdet/modeling/transformers/mask_dino_transformer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
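# MaskDINO couples a deformable transformer encoder/decoder with a stride-4
# mask branch: a class head and a box head shared across all decoder layers
# score each layer's queries, while `_get_pred_class_and_mask` dot-products
# query embeddings with the mask feature map to produce per-query masks.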
from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from .position_encoding import PositionEmbedding from ..heads.detr_head import MLP from .deformable_transformer import (DeformableTransformerEncoderLayer, DeformableTransformerEncoder) from .dino_transformer import (DINOTransformerDecoderLayer) from ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob) from .utils import (_get_clones, get_valid_ratio, get_denoising_training_group, get_sine_pos_embed, inverse_sigmoid, mask_to_box_coordinate) __all__ = ['MaskDINO'] class ConvGNBlock(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride=1, groups=1, num_groups=32, bias=False, act=None): super(ConvGNBlock, self).__init__() self.conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=(kernel_size - 1) // 2, groups=groups, bias_attr=bias) self.norm = nn.GroupNorm( num_groups, out_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.act = getattr(F, act) if act is not None else None self._init_weights() def _init_weights(self): xavier_uniform_(self.conv.weight) def forward(self, x): x = self.norm(self.conv(x)) if self.act is not None: x = self.act(x) return x class MaskDINOTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers): super(MaskDINOTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, query_pos_head, dec_norm, valid_ratios=None, attn_mask=None, memory_mask=None): if valid_ratios is None: valid_ratios = paddle.ones( [memory.shape[0], memory_spatial_shapes.shape[0], 2]) output = tgt intermediate = [] inter_bboxes = [] ref_points = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): reference_points_input = ref_points.detach().unsqueeze( 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) query_pos_embed = get_sine_pos_embed( reference_points_input[..., 0, :], self.hidden_dim // 2) query_pos_embed = query_pos_head(query_pos_embed) output = layer(output, reference_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) ref_points = F.sigmoid( bbox_head(output) + inverse_sigmoid(ref_points.detach())) intermediate.append(dec_norm(output)) inter_bboxes.append(ref_points) return paddle.stack(intermediate), paddle.stack(inter_bboxes) @register class MaskDINO(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', in_feats_channel=[256, 512, 1024, 2048], num_levels=3, num_encoder_points=4, num_decoder_points=4, nhead=8, num_encoder_layers=6, num_decoder_layers=9, enc_dim_feedforward=1024, dec_dim_feedforward=2048, dropout=0., activation="relu", lr_mult=1.0, pe_temperature=10000, pe_offset=-0.5, num_denoising=100, label_noise_ratio=0.4, box_noise_scale=0.4, learnt_init_query=False, mask_enhanced=True, eps=1e-2): super(MaskDINO, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
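        # The first entry of `in_feats_channel` is the stride-4 feature used
        # only by the mask branch (`enc_mask_lateral`/`enc_mask_output`); the
        # remaining levels are projected and fed to the deformable encoder.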
feat0_dim = in_feats_channel.pop(0) assert len(in_feats_channel) <= num_levels self.hidden_dim = hidden_dim self.nhead = nhead self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.mask_enhanced = mask_enhanced weight_attr = ParamAttr(regularizer=L2Decay(0.0)) bias_attr = ParamAttr(regularizer=L2Decay(0.0)) # backbone feature projection self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) # Transformer module encoder_layer = DeformableTransformerEncoderLayer( hidden_dim, nhead, enc_dim_feedforward, dropout, activation, num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DINOTransformerDecoderLayer( hidden_dim, nhead, dec_dim_feedforward, dropout, activation, num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) self.decoder = MaskDINOTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # position embedding self.position_embedding = PositionEmbedding( hidden_dim // 2, temperature=pe_temperature, normalize=True if position_embed_type == 'sine' else False, embed_type=position_embed_type, offset=pe_offset) self.level_embed = nn.Embedding( num_levels, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2) # mask embedding self.mask_query_head = MLP(hidden_dim, hidden_dim, hidden_dim, num_layers=3) # encoder mask head self.enc_mask_lateral = ConvGNBlock(feat0_dim, hidden_dim, 1) self.enc_mask_output = nn.Sequential( ConvGNBlock( hidden_dim, hidden_dim, 3, act=activation), nn.Conv2D(hidden_dim, hidden_dim, 1)) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) # decoder norm layer self.dec_norm = nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) # shared prediction head self.class_head = nn.Linear(hidden_dim, num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.class_head) constant_(self.class_head.bias, bias_cls) constant_(self.bbox_head.layers[-1].weight) constant_(self.bbox_head.layers[-1].bias) xavier_uniform_(self.enc_mask_output[1].weight) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) @classmethod def from_config(cls, cfg, input_shape): return {'in_feats_channel': [i.channels for i in input_shape], } def _build_input_proj_layer(self, in_feats_channel, weight_attr=None, bias_attr=None): self.input_proj = nn.LayerList() for in_channels in in_feats_channel: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( 
in_channels, self.hidden_dim, kernel_size=1)), ( 'norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = in_feats_channel[-1] for _ in range(self.num_levels - len(in_feats_channel)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = self.hidden_dim
def _get_encoder_input(self, feats, pad_mask=None): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1]))
# get encoder inputs feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] valid_ratios = [] for i, feat in enumerate(proj_feats): bs, _, h, w = feat.shape spatial_shapes.append(paddle.concat([h, w])) # [b,c,h,w] -> [b,h*w,c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) if pad_mask is not None: mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] else: mask = paddle.ones([bs, h, w]) valid_ratios.append(get_valid_ratio(mask)) # [b, h*w, c] pos_embed = self.position_embedding(mask).flatten(1, 2) lvl_pos_embed = pos_embed + self.level_embed.weight[i] lvl_pos_embed_flatten.append(lvl_pos_embed) if pad_mask is not None: # [b, h*w] mask_flatten.append(mask.flatten(1))
# [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) # [b, l] mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, 1) # [b, l, c] lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # [num_levels, 2] spatial_shapes = paddle.to_tensor( paddle.stack(spatial_shapes).astype('int64')) # [l] start index of each level level_start_index = paddle.concat([ paddle.zeros( [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] ]) # [b, num_levels, 2] valid_ratios = paddle.stack(valid_ratios, 1) return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios)
def forward(self, feats, pad_mask=None, gt_meta=None): feat0 = feats.pop(0) # input projection and embedding (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) = self._get_encoder_input(feats, pad_mask) # encoder memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) mask_feat = self._get_encoder_mask_feature(feat0, memory, spatial_shapes) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_out, init_out = \ self._get_decoder_input( memory, mask_feat, spatial_shapes, mask_flatten, denoising_class, denoising_bbox_unact) # decoder inter_feats, inter_bboxes = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.bbox_head, self.query_pos_head, self.dec_norm, valid_ratios, attn_mask, mask_flatten) out_logits = [] out_bboxes = [] out_masks = [] for i in range(self.num_decoder_layers): if self.training or i ==
self.num_decoder_layers - 1: logits_, masks_ = self._get_pred_class_and_mask(inter_feats[i], mask_feat) else: continue out_logits.append(logits_) out_masks.append(masks_) if i == 0: out_bboxes.append( F.sigmoid( self.bbox_head(inter_feats[i]) + init_ref_points_unact)) else: out_bboxes.append( F.sigmoid( self.bbox_head(inter_feats[i]) + inverse_sigmoid( inter_bboxes[i - 1]))) out_bboxes = paddle.stack(out_bboxes) out_logits = paddle.stack(out_logits) out_masks = paddle.stack(out_masks) return (out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta) def _get_encoder_mask_feature(self, in_feat, memory, spatial_shapes): memory_feat0 = memory.split( spatial_shapes.prod(1).split(self.num_levels), axis=1)[0] h, w = spatial_shapes[0] memory_feat0 = memory_feat0.reshape( [0, h, w, self.hidden_dim]).transpose([0, 3, 1, 2]) out = self.enc_mask_lateral(in_feat) + F.interpolate( memory_feat0, scale_factor=2.0, mode='bilinear', align_corners=False) return self.enc_mask_output(out) def _get_encoder_output_anchors(self, memory, spatial_shapes, memory_mask=None, grid_size=0.05): output_anchors = [] idx = 0 for lvl, (h, w) in enumerate(spatial_shapes): if memory_mask is not None: mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) valid_H = paddle.sum(mask_[:, :, 0], 1) valid_W = paddle.sum(mask_[:, 0, :], 1) else: valid_H, valid_W = h, w grid_y, grid_x = paddle.meshgrid( paddle.arange(end=h), paddle.arange(end=w)) grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( [-1, 1, 1, 2]).astype(grid_xy.dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) output_anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) idx += h * w output_anchors = paddle.concat(output_anchors, 1) valid_mask = ((output_anchors > self.eps) * (output_anchors < 1 - self.eps)).all(-1, keepdim=True) output_anchors = paddle.log(output_anchors / (1 - output_anchors)) if memory_mask is not None: valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 output_anchors = paddle.where(valid_mask, output_anchors, paddle.to_tensor(float("inf"))) memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) return output_memory, output_anchors def _get_decoder_input(self, memory, mask_feat, spatial_shapes, memory_mask=None, denoising_class=None, denoising_bbox_unact=None): # prepare input for decoder bs, _, _ = memory.shape output_memory, output_anchors = self._get_encoder_output_anchors( memory, spatial_shapes, memory_mask) enc_logits_unact = self.class_head(output_memory) enc_bboxes_unact = self.bbox_head(output_memory) + output_anchors # get topk index _, topk_ind = paddle.topk( enc_logits_unact.max(-1), self.num_queries, axis=1) batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) # extract content and position query embedding target = paddle.gather_nd(output_memory, topk_ind) reference_points_unact = paddle.gather_nd(enc_bboxes_unact, topk_ind) # unsigmoided. 
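        # Reference boxes are kept in logit ("unsigmoided") space so that
        # decoder deltas compose additively, i.e.
        # box = sigmoid(bbox_head(feat) + inverse_sigmoid(prev_box));
        # the sigmoid is applied only where a box is actually consumed.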
        # get encoder output: {logits, bboxes, masks}
        enc_out_logits, enc_out_masks = self._get_pred_class_and_mask(
            target, mask_feat)
        enc_out_bboxes = F.sigmoid(reference_points_unact)
        enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks)

        # concat denoising query
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        if self.mask_enhanced:
            # use mask-enhanced anchor box initialization
            reference_points = mask_to_box_coordinate(
                enc_out_masks > 0, normalize=True, format="xywh")
            reference_points_unact = inverse_sigmoid(reference_points)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)

        # direct prediction from the matching and denoising part in the beginning
        if self.training and denoising_class is not None:
            init_out_logits, init_out_masks = self._get_pred_class_and_mask(
                target, mask_feat)
            init_out_bboxes = F.sigmoid(reference_points_unact)
            init_out = (init_out_logits, init_out_bboxes, init_out_masks)
        else:
            init_out = None

        return target, reference_points_unact.detach(), enc_out, init_out

    def _get_pred_class_and_mask(self, query_embed, mask_feat):
        out_query = self.dec_norm(query_embed)
        out_logits = self.class_head(out_query)
        mask_query_embed = self.mask_query_head(out_query)
        _, _, h, w = mask_feat.shape
        # [b, q, c] x [b, c, h, w] -> [b, q, h, w]
        out_mask = paddle.bmm(mask_query_embed, mask_feat.flatten(2)).reshape(
            [0, 0, h, w])
        return out_logits, out_mask


================================================
FILE: ppdet/modeling/transformers/mask_rtdetr_transformer.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
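# A minimal sketch of the box-refinement rule used by the decoder in this
# module, b_i = sigmoid(bbox_head(h_i) + inverse_sigmoid(b_{i-1})); kept as
# comments, and the helper name `_demo_box_refine` and `delta` (standing in
# for `bbox_head(output)`) are illustrative, assuming only paddle:
#
#     def _demo_box_refine(prev_boxes, delta, eps=1e-5):
#         x = prev_boxes.clip(min=eps, max=1 - eps)
#         unact = paddle.log(x / (1 - x))   # inverse_sigmoid, as in .utils
#         return F.sigmoid(delta + unact)   # result stays in (0, 1)
#
# With delta == 0 the boxes are reproduced exactly, since
# sigmoid(inverse_sigmoid(0.5)) == 0.5; a learned delta nudges each box in
# logit space, which keeps every refined coordinate inside (0, 1).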
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from .rtdetr_transformer import TransformerDecoderLayer from .utils import (_get_clones, inverse_sigmoid, get_denoising_training_group, mask_to_box_coordinate) from ..heads.detr_head import MLP from ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob) __all__ = ['MaskRTDETR'] def _get_pred_class_and_mask(query_embed, mask_feat, dec_norm, score_head, mask_query_head): out_query = dec_norm(query_embed) out_logits = score_head(out_query) mask_query_embed = mask_query_head(out_query) batch_size, mask_dim, _ = mask_query_embed.shape _, _, mask_h, mask_w = mask_feat.shape out_mask = paddle.bmm( mask_query_embed, mask_feat.flatten(2)).reshape( [batch_size, mask_dim, mask_h, mask_w]) return out_logits, out_mask class MaskTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1, eval_topk=100): super(MaskTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 \ else num_layers + eval_idx self.eval_topk = eval_topk def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, mask_feat, bbox_head, score_head, query_pos_head, mask_query_head, dec_norm, attn_mask=None, memory_mask=None, query_pos_head_inv_sig=False): output = tgt dec_out_bboxes = [] dec_out_logits = [] dec_out_masks = [] ref_points_detach = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): ref_points_input = ref_points_detach.unsqueeze(2) if not query_pos_head_inv_sig: query_pos_embed = query_pos_head(ref_points_detach) else: query_pos_embed = query_pos_head( inverse_sigmoid(ref_points_detach)) output = layer(output, ref_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head(output) + inverse_sigmoid(ref_points_detach)) if self.training: logits_, masks_ = _get_pred_class_and_mask( output, mask_feat, dec_norm, score_head, mask_query_head) dec_out_logits.append(logits_) dec_out_masks.append(masks_) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: dec_out_bboxes.append( F.sigmoid(bbox_head(output) + inverse_sigmoid(ref_points))) elif i == self.eval_idx: logits_, masks_ = _get_pred_class_and_mask( output, mask_feat, dec_norm, score_head, mask_query_head) dec_out_logits.append(logits_) dec_out_masks.append(masks_) dec_out_bboxes.append(inter_ref_bbox) return (paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits), paddle.stack(dec_out_masks)) ref_points = inter_ref_bbox ref_points_detach = inter_ref_bbox.detach( ) if self.training else inter_ref_bbox return (paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits), paddle.stack(dec_out_masks)) @register class MaskRTDETR(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size', 'num_prototypes'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_prototypes=32, num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.4, 
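                 # Denoising knobs (consumed by get_denoising_training_group
                 # in .utils): `num_denoising` is roughly the per-image budget
                 # of noised ground-truth queries, `label_noise_ratio` controls
                 # how often a denoising label is randomly flipped, and
                 # `box_noise_scale` scales the jitter added to GT boxes.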
box_noise_scale=0.4, learnt_init_query=False, query_pos_head_inv_sig=False, mask_enhanced=True, eval_size=None, eval_idx=-1, eps=1e-2): super(MaskRTDETR, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.mask_enhanced = mask_enhanced self.eval_size = eval_size # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) self.decoder = MaskTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # mask embedding self.mask_query_head = MLP(hidden_dim, hidden_dim, num_prototypes, num_layers=3) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) # decoder norm layer self.dec_norm = nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # shared prediction head self.score_head = nn.Linear(hidden_dim, num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.score_head) constant_(self.score_head.bias, bias_cls) constant_(self.bbox_head.layers[-1].weight) constant_(self.bbox_head.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape], 'feat_strides': [i.stride for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = 
backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return feat_flatten, spatial_shapes, level_start_index def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): enc_feats, mask_feat = feats # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(enc_feats) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact,\ attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_out, init_out = \ self._get_decoder_input( memory, mask_feat, spatial_shapes, denoising_class, denoising_bbox_unact, is_teacher) # decoder out_bboxes, out_logits, out_masks = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, mask_feat, self.bbox_head, self.score_head, self.query_pos_head, self.mask_query_head, self.dec_norm, attn_mask=attn_mask, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=paddle.float32): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0 ** lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, mask_feat, spatial_shapes, denoising_class=None, denoising_bbox_unact=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if 
self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) enc_logits_unact = self.score_head(output_memory) enc_bboxes_unact = self.bbox_head(output_memory) + anchors # get topk index _, topk_ind = paddle.topk( enc_logits_unact.max(-1), self.num_queries, axis=1) batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) # extract content and position query embedding target = paddle.gather_nd(output_memory, topk_ind) reference_points_unact = paddle.gather_nd(enc_bboxes_unact, topk_ind) # unsigmoided. # get encoder output: {logits, bboxes, masks} enc_out_logits, enc_out_masks = _get_pred_class_and_mask( target, mask_feat, self.dec_norm, self.score_head, self.mask_query_head) enc_out_bboxes = F.sigmoid(reference_points_unact) enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks) # concat denoising query if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = target.detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) if self.mask_enhanced: # use mask-enhanced anchor box initialization reference_points = mask_to_box_coordinate( enc_out_masks > 0, normalize=True, format="xywh") reference_points_unact = inverse_sigmoid(reference_points) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) # direct prediction from the matching and denoising part in the beginning if self.training and denoising_class is not None: init_out_logits, init_out_masks = _get_pred_class_and_mask( target, mask_feat, self.dec_norm, self.score_head, self.mask_query_head) init_out_bboxes = F.sigmoid(reference_points_unact) init_out = (init_out_logits, init_out_bboxes, init_out_masks) else: init_out = None return target, reference_points_unact.detach(), enc_out, init_out ================================================ FILE: ppdet/modeling/transformers/matchers.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from scipy.optimize import linear_sum_assignment

from ppdet.core.workspace import register, serializable
from ..losses.iou_loss import GIoULoss
from .utils import bbox_cxcywh_to_xyxy

__all__ = ['HungarianMatcher']


@register
@serializable
class HungarianMatcher(nn.Layer):
    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']

    def __init__(self,
                 matcher_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 1,
                     'dice': 1
                 },
                 use_focal_loss=False,
                 with_mask=False,
                 num_sample_points=12544,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.num_sample_points = num_sample_points
        self.alpha = alpha
        self.gamma = gamma
        self.giou_loss = GIoULoss()

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor|None): [b, query, h, w]
            gt_mask (List(Tensor)): list[[n, H, W]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = [len(a) for a in gt_class]
        if sum(num_gts) == 0:
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        logits = logits.detach()
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.detach().flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost
        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
        if self.use_focal_loss:
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class - neg_cost_class
        else:
            cost_class = -out_prob

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost between boxes
        giou_loss = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)
        cost_giou = giou_loss - 1

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + \
            self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou

        # Compute the mask cost and dice cost
        if self.with_mask:
            assert masks is not None and gt_mask is not None, \
                'Make sure the input has `mask` and `gt_mask`'
            # all masks share the same set of points for efficient matching
            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
            sample_points = 2.0 * sample_points - 1.0

            out_mask = F.grid_sample(
                masks.detach(), sample_points, align_corners=False).squeeze(-2)
            out_mask = out_mask.flatten(0, 1)

            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
            sample_points = paddle.concat([
                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
                if b > 0
            ])
            tgt_mask = F.grid_sample(
                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])

            with paddle.amp.auto_cast(enable=False):
                # binary cross entropy cost
                pos_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.ones_like(out_mask), reduction='none')
                neg_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.zeros_like(out_mask), reduction='none')
                cost_mask = paddle.matmul(
                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
                cost_mask /= self.num_sample_points

                # dice cost
                out_mask = F.sigmoid(out_mask)
                numerator = 2 * paddle.matmul(
                    out_mask, tgt_mask, transpose_y=True)
                denominator = out_mask.sum(
                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
                cost_dice = 1 - (numerator + 1) / (denominator + 1)

                C = C + self.matcher_coeff['mask'] * cost_mask + \
                    self.matcher_coeff['dice'] * cost_dice

        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        sizes = [a.shape[0] for a in gt_bbox]
        if hasattr(paddle.Tensor, "contiguous"):
            indices = [
                linear_sum_assignment(
                    c.split(sizes, -1)[i].contiguous().numpy())
                for i, c in enumerate(C)
            ]
        else:
            indices = [
                linear_sum_assignment(c.split(sizes, -1)[i].numpy())
                for i, c in enumerate(C)
            ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]


================================================
FILE: ppdet/modeling/transformers/petr_transformer.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/utils/transformer.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import warnings

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr

from ppdet.core.workspace import register
from ..layers import MultiHeadAttention, _convert_attention_mask
from .utils import _get_clones
from ..initializer import linear_init_, normal_, constant_, xavier_uniform_

__all__ = [
    'PETRTransformer', 'MultiScaleDeformablePoseAttention',
    'PETR_TransformerDecoderLayer', 'PETR_TransformerDecoder',
    'PETR_DeformableDetrTransformerDecoder',
    'PETR_DeformableTransformerDecoder', 'TransformerEncoderLayer',
    'TransformerEncoder', 'MSDeformableAttention'
]


def masked_fill(x, mask, value):
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, y, x)


def inverse_sigmoid(x, eps=1e-5):
    """Inverse function of sigmoid.

    Args:
        x (Tensor): The tensor to do the inverse.
        eps (float): EPS avoid numerical overflow. Defaults 1e-5.

    Returns:
        Tensor: `x` passed through the inverse sigmoid, with the same
            shape as the input.
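
    Example (illustrative): log(0.5 / 0.5) == 0, so an all-0.5 input maps
    to all zeros, and F.sigmoid(inverse_sigmoid(x)) recovers x up to the
    eps clipping.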
""" x = x.clip(min=0, max=1) x1 = x.clip(min=eps) x2 = (1 - x).clip(min=eps) return paddle.log(x1 / x2) @register class TransformerEncoderLayer(nn.Layer): __inject__ = ['attn'] def __init__(self, d_model, attn=None, nhead=8, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.embed_dims = d_model if attn is None: self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) else: self.self_attn = attn # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None, **kwargs): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask, **kwargs) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src @register class TransformerEncoder(nn.Layer): __inject__ = ['encoder_layer'] def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.embed_dims = encoder_layer.embed_dims def forward(self, src, src_mask=None, pos_embed=None, **kwargs): output = src for layer in self.layers: output = layer( output, src_mask=src_mask, pos_embed=pos_embed, **kwargs) if self.norm is not None: output = self.norm(output) return output @register class MSDeformableAttention(nn.Layer): def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, lr_mult=0.1): """ Multi-Scale Deformable Attention Module """ super(MSDeformableAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.num_levels = num_levels self.num_points = num_points self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) try: # use cuda op print("use deformable_detr_ops in ms_deformable_attn") from deformable_detr_ops import ms_deformable_attn except: # use paddle func from .utils import deformable_attention_core_func as 
ms_deformable_attn self.ms_deformable_attn_core = ms_deformable_attn self._reset_parameters() def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) thetas = paddle.arange( self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( [1, self.num_levels, self.num_points, 1]) scaling = paddle.arange( 1, self.num_points + 1, dtype=paddle.float32).reshape([1, 1, -1, 1]) grid_init *= scaling self.sampling_offsets.bias.set_value(grid_init.flatten()) # attention_weights constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) # proj xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, key, value, reference_points, value_spatial_shapes, value_level_start_index, attn_mask=None, **kwargs): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] attn_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] assert int(value_spatial_shapes.prod(1).sum()) == Len_v value = self.value_proj(value) if attn_mask is not None: attn_mask = attn_mask.astype(value.dtype).unsqueeze(-1) value *= attn_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = value_spatial_shapes.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". format(reference_points.shape[-1])) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output @register class MultiScaleDeformablePoseAttention(nn.Layer): """An attention module used in PETR. `End-to-End Multi-Person Pose Estimation with Transformers`. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 8. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 17. im2col_step (int): The step used in image_to_column. 
Default: 64. dropout (float): A Dropout layer on `inp_residual`. Default: 0.1. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=17, im2col_step=64, dropout=0.1, norm_cfg=None, init_cfg=None, batch_first=False, lr_mult=0.1): super().__init__() if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.init_cfg = init_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn("You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) try: # use cuda op from deformable_detr_ops import ms_deformable_attn except: # use paddle func from .utils import deformable_attention_core_func as ms_deformable_attn self.ms_deformable_attn_core = ms_deformable_attn self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_(self.sampling_offsets.weight) constant_(self.sampling_offsets.bias) constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, key, value, residual=None, attn_mask=None, reference_points=None, value_spatial_shapes=None, value_level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape (num_key, bs, embed_dims). value (Tensor): The value tensor with shape (num_key, bs, embed_dims). residual (Tensor): The tensor used for addition, with the same shape as `x`. Default None. If None, `x` will be used. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, K*2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. attn_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. value_spatial_shapes (Tensor): Spatial shape of features in different level. With shape (num_levels, 2), last dimension represent (h, w). value_level_start_index (Tensor): The start index of each level. A tensor has shape (num_levels) and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
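
        Note: with K keypoint reference points per query, the sampling
        locations are spread around the keypoints and scaled by half the
        width/height of the per-query keypoint bounding box (the min/max
        over the K points), as computed in the body below.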
""" if key is None: key = query if value is None: value = key bs, num_query, _ = query.shape bs, num_key, _ = value.shape assert (value_spatial_shapes[:, 0].numpy() * value_spatial_shapes[:, 1].numpy()).sum() == num_key value = self.value_proj(value) if attn_mask is not None: # value = value.masked_fill(attn_mask[..., None], 0.0) value *= attn_mask.unsqueeze(-1) value = value.reshape([bs, num_key, self.num_heads, -1]) sampling_offsets = self.sampling_offsets(query).reshape([ bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 ]) attention_weights = self.attention_weights(query).reshape( [bs, num_query, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights, axis=-1) attention_weights = attention_weights.reshape( [bs, num_query, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == self.num_points * 2: reference_points_reshape = reference_points.reshape( (bs, num_query, self.num_levels, -1, 2)).unsqueeze(2) x1 = reference_points[:, :, :, 0::2].min(axis=-1, keepdim=True) y1 = reference_points[:, :, :, 1::2].min(axis=-1, keepdim=True) x2 = reference_points[:, :, :, 0::2].max(axis=-1, keepdim=True) y2 = reference_points[:, :, :, 1::2].max(axis=-1, keepdim=True) w = paddle.clip(x2 - x1, min=1e-4) h = paddle.clip(y2 - y1, min=1e-4) wh = paddle.concat([w, h], axis=-1)[:, :, None, :, None, :] sampling_locations = reference_points_reshape \ + sampling_offsets * wh * 0.5 else: raise ValueError( f'Last dim of reference_points must be' f' 2K, but get {reference_points.shape[-1]} instead.') output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output @register class PETR_TransformerDecoderLayer(nn.Layer): __inject__ = ['self_attn', 'cross_attn'] def __init__(self, d_model, nhead=8, self_attn=None, cross_attn=None, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(PETR_TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before if self_attn is None: self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) else: self.self_attn = self_attn if cross_attn is None: self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) else: self.cross_attn = cross_attn # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos_embed=None, query_pos_embed=None, **kwargs): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) residual = tgt if self.normalize_before: tgt = self.norm1(tgt) q = k = self.with_pos_embed(tgt, 
query_pos_embed) tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) q = self.with_pos_embed(tgt, query_pos_embed) key_tmp = tgt # k = self.with_pos_embed(memory, pos_embed) tgt = self.cross_attn( q, key=key_tmp, value=memory, attn_mask=memory_mask, **kwargs) tgt = residual + self.dropout2(tgt) if not self.normalize_before: tgt = self.norm2(tgt) residual = tgt if self.normalize_before: tgt = self.norm3(tgt) tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = residual + self.dropout3(tgt) if not self.normalize_before: tgt = self.norm3(tgt) return tgt @register class PETR_TransformerDecoder(nn.Layer): """Implements the decoder in PETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ __inject__ = ['decoder_layer'] def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False, num_keypoints=17, **kwargs): super(PETR_TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate self.num_keypoints = num_keypoints def forward(self, query, *args, reference_points=None, valid_ratios=None, kpt_branches=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape (num_query, bs, embed_dims). reference_points (Tensor): The reference points of offset, has shape (bs, num_query, K*2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2). kpt_branches: (obj:`nn.LayerList`): Used for refining the regression results. Only would be passed when `with_box_refine` is True, otherwise would be passed a `None`. Returns: tuple (Tensor): Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims] and [num_layers, bs, num_query, K*2]. 
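
        Note: when `kpt_branches` is given, each layer refines the K*2
        keypoint reference points in logit space,
        new_ref = sigmoid(kpt_branches[lid](output) + inverse_sigmoid(ref)),
        and detaches them before the next layer.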
""" output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): if reference_points.shape[-1] == self.num_keypoints * 2: reference_points_input = \ reference_points[:, :, None] * \ valid_ratios.tile((1, 1, self.num_keypoints))[:, None] else: assert reference_points.shape[-1] == 2 reference_points_input = reference_points[:, :, None] * \ valid_ratios[:, None] output = layer( output, *args, reference_points=reference_points_input, **kwargs) if kpt_branches is not None: tmp = kpt_branches[lid](output) if reference_points.shape[-1] == self.num_keypoints * 2: new_reference_points = tmp + inverse_sigmoid( reference_points) new_reference_points = F.sigmoid(new_reference_points) else: raise NotImplementedError reference_points = new_reference_points.detach() if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return paddle.stack(intermediate), paddle.stack( intermediate_reference_points) return output, reference_points @register class PETR_DeformableTransformerDecoder(nn.Layer): __inject__ = ['decoder_layer'] def __init__(self, decoder_layer, num_layers, return_intermediate=False): super(PETR_DeformableTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.return_intermediate = return_intermediate def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_mask=None, query_pos_embed=None): output = tgt intermediate = [] for lid, layer in enumerate(self.layers): output = layer(output, reference_points, memory, memory_spatial_shapes, memory_mask, query_pos_embed) if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class PETR_DeformableDetrTransformerDecoder(PETR_DeformableTransformerDecoder): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, return_intermediate=False, **kwargs): super(PETR_DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate def forward(self, query, *args, reference_points=None, valid_ratios=None, reg_branches=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.LayerList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
""" output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): if reference_points.shape[-1] == 4: reference_points_input = reference_points[:, :, None] * \ paddle.concat([valid_ratios, valid_ratios], -1)[:, None] else: assert reference_points.shape[-1] == 2 reference_points_input = reference_points[:, :, None] * \ valid_ratios[:, None] output = layer( output, *args, reference_points=reference_points_input, **kwargs) if reg_branches is not None: tmp = reg_branches[lid](output) if reference_points.shape[-1] == 4: new_reference_points = tmp + inverse_sigmoid( reference_points) new_reference_points = F.sigmoid(new_reference_points) else: assert reference_points.shape[-1] == 2 new_reference_points = tmp new_reference_points[..., :2] = tmp[ ..., :2] + inverse_sigmoid(reference_points) new_reference_points = F.sigmoid(new_reference_points) reference_points = new_reference_points.detach() if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return paddle.stack(intermediate), paddle.stack( intermediate_reference_points) return output, reference_points @register class PETRTransformer(nn.Layer): """Implements the PETR transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ __inject__ = ["encoder", "decoder", "hm_encoder", "refine_decoder"] def __init__(self, encoder="", decoder="", hm_encoder="", refine_decoder="", as_two_stage=True, num_feature_levels=4, two_stage_num_proposals=300, num_keypoints=17, **kwargs): super(PETRTransformer, self).__init__(**kwargs) self.as_two_stage = as_two_stage self.num_feature_levels = num_feature_levels self.two_stage_num_proposals = two_stage_num_proposals self.num_keypoints = num_keypoints self.encoder = encoder self.decoder = decoder self.embed_dims = self.encoder.embed_dims self.hm_encoder = hm_encoder self.refine_decoder = refine_decoder self.init_layers() self.init_weights() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" #paddle.create_parameter self.level_embeds = paddle.create_parameter( (self.num_feature_levels, self.embed_dims), dtype="float32") if self.as_two_stage: self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) self.enc_output_norm = nn.LayerNorm(self.embed_dims) self.refine_query_embedding = nn.Embedding(self.num_keypoints, self.embed_dims * 2) else: self.reference_points = nn.Linear(self.embed_dims, 2 * self.num_keypoints) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.rank() > 1: xavier_uniform_(p) if hasattr(p, 'bias') and p.bias is not None: constant_(p.bais) for m in self.sublayers(): if isinstance(m, MSDeformableAttention): m._reset_parameters() for m in self.sublayers(): if isinstance(m, MultiScaleDeformablePoseAttention): m.init_weights() if not self.as_two_stage: xavier_uniform_(self.reference_points.weight) constant_(self.reference_points.bias) normal_(self.level_embeds) normal_(self.refine_query_embedding.weight) def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes): """Generate proposals from encoded memory. Args: memory (Tensor): The output of encoder, has shape (bs, num_key, embed_dim). num_key is equal the number of points on feature map from all level. 
memory_padding_mask (Tensor): Padding mask for memory. has shape (bs, num_key). spatial_shapes (Tensor): The shape of all feature maps. has shape (num_level, 2). Returns: tuple: A tuple of feature map and bbox prediction. - output_memory (Tensor): The input of decoder, has shape (bs, num_key, embed_dim). num_key is equal the number of points on feature map from all levels. - output_proposals (Tensor): The normalized proposal after a inverse sigmoid, has shape (bs, num_keys, 4). """ N, S, C = memory.shape proposals = [] _cur = 0 for lvl, (H, W) in enumerate(spatial_shapes): mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].reshape( [N, H, W, 1]) valid_H = paddle.sum(mask_flatten_[:, :, 0, 0], 1) valid_W = paddle.sum(mask_flatten_[:, 0, :, 0], 1) grid_y, grid_x = paddle.meshgrid( paddle.linspace( 0, H - 1, H, dtype="float32"), paddle.linspace( 0, W - 1, W, dtype="float32")) grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) scale = paddle.concat( [valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N, 1, 1, 2]) grid = (grid.unsqueeze(0).expand((N, -1, -1, -1)) + 0.5) / scale proposal = grid.reshape([N, -1, 2]) proposals.append(proposal) _cur += (H * W) output_proposals = paddle.concat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( -1, keepdim=True).astype("bool") output_proposals = paddle.log(output_proposals / (1 - output_proposals)) output_proposals = masked_fill( output_proposals, ~memory_padding_mask.astype("bool").unsqueeze(-1), float('inf')) output_proposals = masked_fill(output_proposals, ~output_proposals_valid, float('inf')) output_memory = memory output_memory = masked_fill( output_memory, ~memory_padding_mask.astype("bool").unsqueeze(-1), float(0)) output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) output_memory = self.enc_output_norm(self.enc_output(output_memory)) return output_memory, output_proposals @staticmethod def get_reference_points(spatial_shapes, valid_ratios): """Get the reference points used in decoder. Args: spatial_shapes (Tensor): The shape of all feature maps, has shape (num_level, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2). Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). 
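
        Note: the points are the (x + 0.5, y + 0.5) cell centres of every
        level's grid, normalized by each image's valid height/width and
        finally scaled by `valid_ratios` across levels.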
""" reference_points_list = [] for lvl, (H, W) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( paddle.linspace( 0.5, H - 0.5, H, dtype="float32"), paddle.linspace( 0.5, W - 0.5, W, dtype="float32")) ref_y = ref_y.reshape( (-1, ))[None] / (valid_ratios[:, None, lvl, 1] * H) ref_x = ref_x.reshape( (-1, ))[None] / (valid_ratios[:, None, lvl, 0] * W) ref = paddle.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = paddle.concat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def get_valid_ratio(self, mask): """Get the valid radios of feature maps of all level.""" _, H, W = mask.shape valid_H = paddle.sum(mask[:, :, 0].astype('float'), 1) valid_W = paddle.sum(mask[:, 0, :].astype('float'), 1) valid_ratio_h = valid_H.astype('float') / H valid_ratio_w = valid_W.astype('float') / W valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def get_proposal_pos_embed(self, proposals, num_pos_feats=128, temperature=10000): """Get the position embedding of proposal.""" scale = 2 * math.pi dim_t = paddle.arange(num_pos_feats, dtype="float32") dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) # N, L, 4 proposals = F.sigmoid(proposals) * scale # N, L, 4, 128 pos = proposals[:, :, :, None] / dim_t # N, L, 4, 64, 2 pos = paddle.stack( (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), axis=4).flatten(2) return pos def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, kpt_branches=None, cls_branches=None): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. kpt_branches (obj:`nn.LayerList`): Keypoint Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is Ture. Default to None. cls_branches (obj:`nn.LayerList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is Ture. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If `return_intermediate_dec` is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of proposals \ generated from encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_kpt_unact: The regression results generated from \ encoder's feature maps., has shape (batch, h*w, K*2). Only would be returned when `as_two_stage` is True, \ otherwise None. 
""" assert self.as_two_stage or query_embed is not None feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed ) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose((0, 2, 1)) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose((0, 2, 1)) lvl_pos_embed = pos_embed + self.level_embeds[lvl].reshape( [1, 1, -1]) lvl_pos_embed_flatten.append(lvl_pos_embed) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = paddle.concat(feat_flatten, 1) mask_flatten = paddle.concat(mask_flatten, 1) lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) spatial_shapes_cumsum = paddle.to_tensor( np.array(spatial_shapes).prod(1).cumsum(0)) spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") level_start_index = paddle.concat((paddle.zeros( (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) valid_ratios = paddle.stack( [self.get_valid_ratio(m) for m in mlvl_masks], 1) reference_points = \ self.get_reference_points(spatial_shapes, valid_ratios) memory = self.encoder( src=feat_flatten, pos_embed=lvl_pos_embed_flatten, src_mask=mask_flatten, value_spatial_shapes=spatial_shapes, reference_points=reference_points, value_level_start_index=level_start_index, valid_ratios=valid_ratios) bs, _, c = memory.shape hm_proto = None if self.training: hm_memory = paddle.slice( memory, starts=level_start_index[0], ends=level_start_index[1], axes=[1]) hm_pos_embed = paddle.slice( lvl_pos_embed_flatten, starts=level_start_index[0], ends=level_start_index[1], axes=[1]) hm_mask = paddle.slice( mask_flatten, starts=level_start_index[0], ends=level_start_index[1], axes=[1]) hm_reference_points = paddle.slice( reference_points, starts=level_start_index[0], ends=level_start_index[1], axes=[1])[:, :, :1, :] # official code make a mistake of pos_embed to pose_embed, which disable pos_embed hm_memory = self.hm_encoder( src=hm_memory, pose_embed=hm_pos_embed, src_mask=hm_mask, value_spatial_shapes=spatial_shapes[[0]], reference_points=hm_reference_points, value_level_start_index=level_start_index[0], valid_ratios=valid_ratios[:, :1, :]) hm_memory = hm_memory.reshape((bs, spatial_shapes[0, 0], spatial_shapes[0, 1], -1)) hm_proto = (hm_memory, mlvl_masks[0]) if self.as_two_stage: output_memory, output_proposals = \ self.gen_encoder_output_proposals( memory, mask_flatten, spatial_shapes) enc_outputs_class = cls_branches[self.decoder.num_layers]( output_memory) enc_outputs_kpt_unact = \ kpt_branches[self.decoder.num_layers](output_memory) enc_outputs_kpt_unact[..., 0::2] += output_proposals[..., 0:1] enc_outputs_kpt_unact[..., 1::2] += output_proposals[..., 1:2] topk = self.two_stage_num_proposals topk_proposals = paddle.topk( enc_outputs_class[..., 0], topk, axis=1)[1].unsqueeze(-1) #paddle.take_along_axis 对应torch.gather topk_kpts_unact = paddle.take_along_axis(enc_outputs_kpt_unact, topk_proposals, 1) topk_kpts_unact = topk_kpts_unact.detach() reference_points = F.sigmoid(topk_kpts_unact) init_reference_out = reference_points # learnable query and query_pos query_pos, query = paddle.split( query_embed, query_embed.shape[1] // c, axis=1) query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) query = query.unsqueeze(0).expand((bs, -1, -1)) else: query_pos, query = paddle.split( query_embed, query_embed.shape[1] // c, axis=1) query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) query = 
query.unsqueeze(0).expand((bs, -1, -1)) reference_points = F.sigmoid(self.reference_points(query_pos)) init_reference_out = reference_points # decoder inter_states, inter_references = self.decoder( query=query, memory=memory, query_pos_embed=query_pos, memory_mask=mask_flatten, reference_points=reference_points, value_spatial_shapes=spatial_shapes, value_level_start_index=level_start_index, valid_ratios=valid_ratios, kpt_branches=kpt_branches) inter_references_out = inter_references if self.as_two_stage: return inter_states, init_reference_out, \ inter_references_out, enc_outputs_class, \ enc_outputs_kpt_unact, hm_proto, memory return inter_states, init_reference_out, \ inter_references_out, None, None, None, None, None, hm_proto def forward_refine(self, mlvl_masks, memory, reference_points_pose, img_inds, kpt_branches=None, **kwargs): mask_flatten = [] spatial_shapes = [] for lvl, mask in enumerate(mlvl_masks): bs, h, w = mask.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) mask = mask.flatten(1) mask_flatten.append(mask) mask_flatten = paddle.concat(mask_flatten, 1) spatial_shapes_cumsum = paddle.to_tensor( np.array( spatial_shapes, dtype='int64').prod(1).cumsum(0)) spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") level_start_index = paddle.concat((paddle.zeros( (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) valid_ratios = paddle.stack( [self.get_valid_ratio(m) for m in mlvl_masks], 1) # pose refinement (17 queries corresponding to 17 keypoints) # learnable query and query_pos refine_query_embedding = self.refine_query_embedding.weight query_pos, query = paddle.split(refine_query_embedding, 2, axis=1) pos_num = reference_points_pose.shape[0] query_pos = query_pos.unsqueeze(0).expand((pos_num, -1, -1)) query = query.unsqueeze(0).expand((pos_num, -1, -1)) reference_points = reference_points_pose.reshape( (pos_num, reference_points_pose.shape[1] // 2, 2)) pos_memory = memory[img_inds] mask_flatten = mask_flatten[img_inds] valid_ratios = valid_ratios[img_inds] if img_inds.size == 1: pos_memory = pos_memory.unsqueeze(0) mask_flatten = mask_flatten.unsqueeze(0) valid_ratios = valid_ratios.unsqueeze(0) inter_states, inter_references = self.refine_decoder( query=query, memory=pos_memory, query_pos_embed=query_pos, memory_mask=mask_flatten, reference_points=reference_points, value_spatial_shapes=spatial_shapes, value_level_start_index=level_start_index, valid_ratios=valid_ratios, reg_branches=kpt_branches, **kwargs) # [num_decoder, num_query, bs, embed_dim] init_reference_out = reference_points return inter_states, init_reference_out, inter_references ================================================ FILE: ppdet/modeling/transformers/position_encoding.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn from ppdet.core.workspace import register, serializable @register @serializable class PositionEmbedding(nn.Layer): def __init__(self, num_pos_feats=128, temperature=10000, normalize=True, scale=2 * math.pi, embed_type='sine', num_embeddings=50, offset=0., eps=1e-6): super(PositionEmbedding, self).__init__() assert embed_type in ['sine', 'learned'] self.embed_type = embed_type self.offset = offset self.eps = eps if self.embed_type == 'sine': self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize self.scale = scale elif self.embed_type == 'learned': self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) else: raise ValueError(f"{self.embed_type} is not supported.") def forward(self, mask): """ Args: mask (Tensor): [B, H, W] Returns: pos (Tensor): [B, H, W, C] """ if self.embed_type == 'sine': y_embed = mask.cumsum(1) x_embed = mask.cumsum(2) if self.normalize: y_embed = (y_embed + self.offset) / ( y_embed[:, -1:, :] + self.eps) * self.scale x_embed = (x_embed + self.offset) / ( x_embed[:, :, -1:] + self.eps) * self.scale dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype('float32') dim_t = self.temperature**(dim_t / self.num_pos_feats) pos_x = x_embed.unsqueeze(-1) / dim_t pos_y = y_embed.unsqueeze(-1) / dim_t pos_x = paddle.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4).flatten(3) pos_y = paddle.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4).flatten(3) return paddle.concat((pos_y, pos_x), axis=3) elif self.embed_type == 'learned': h, w = mask.shape[-2:] i = paddle.arange(w) j = paddle.arange(h) x_emb = self.col_embed(i) y_emb = self.row_embed(j) return paddle.concat( [ x_emb.unsqueeze(0).tile([h, 1, 1]), y_emb.unsqueeze(1).tile([1, w, 1]), ], axis=-1).unsqueeze(0) else: raise ValueError(f"not supported {self.embed_type}") ================================================ FILE: ppdet/modeling/transformers/rtdetr_transformer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
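As a quick sanity check, here is a minimal, self-contained sketch of the sine branch of the PositionEmbedding class defined above. Shapes and constants mirror that class, but the snippet itself is illustrative and not part of the repository:

# Illustrative only: standalone sine position embedding (cf. PositionEmbedding above).
import math
import paddle

num_pos_feats, temperature = 128, 10000
mask = paddle.ones([2, 32, 32])                 # [B, H, W], all pixels valid
y_embed, x_embed = mask.cumsum(1), mask.cumsum(2)
scale = 2 * math.pi
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * scale
dim_t = 2 * (paddle.arange(num_pos_feats) // 2).astype('float32')
dim_t = temperature**(dim_t / num_pos_feats)
pos_x = x_embed.unsqueeze(-1) / dim_t           # [B, H, W, num_pos_feats]
pos_y = y_embed.unsqueeze(-1) / dim_t
pos_x = paddle.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), axis=4).flatten(3)
pos_y = paddle.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), axis=4).flatten(3)
pos = paddle.concat((pos_y, pos_x), axis=3)
print(pos.shape)                                # [2, 32, 32, 256]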
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from ..heads.detr_head import MLP from .deformable_transformer import MSDeformableAttention from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_sine_pos_embed, get_contrastive_denoising_training_group, inverse_sigmoid) __all__ = ['RTDETRTransformer'] class PPMSDeformableAttention(MSDeformableAttention): def forward(self, query, reference_points, value, value_spatial_shapes, value_level_start_index, value_mask=None): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = paddle.to_tensor(value_spatial_shapes) offset_normalizer = offset_normalizer.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) if not isinstance(query, paddle.Tensor): from ppdet.modeling.transformers.utils import deformable_attention_core_func output = deformable_attention_core_func( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) else: value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) value_level_start_index = paddle.to_tensor(value_level_start_index) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, weight_attr=None, bias_attr=None): super(TransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class TransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): super(TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, score_head, query_pos_head, attn_mask=None, memory_mask=None, query_pos_head_inv_sig=False): output = tgt dec_out_bboxes = [] 
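# The loop below performs RT-DETR's iterative box refinement: each layer
# predicts a delta in logit space and the reference is updated as
# ref = sigmoid(bbox_head[i](output) + inverse_sigmoid(ref)).
# Training collects every layer's output for auxiliary (deep) supervision,
# while evaluation stops at eval_idx and keeps a single prediction.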
dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): ref_points_input = ref_points_detach.unsqueeze(2) if not query_pos_head_inv_sig: query_pos_embed = query_pos_head(ref_points_detach) else: query_pos_embed = query_pos_head( inverse_sigmoid(ref_points_detach)) output = layer(output, ref_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points_detach)) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: dec_out_bboxes.append( F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points))) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) dec_out_bboxes.append(inter_ref_bbox) break ref_points = inter_ref_bbox ref_points_detach = inter_ref_bbox.detach( ) if self.training else inter_ref_bbox return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) @register class RTDETRTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, query_pos_head_inv_sig=False, eval_size=None, eval_idx=-1, eps=1e-2): super(RTDETRTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.eval_size = eval_size # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ 
MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) # decoder 
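# At this point target is [bs, num_denoising + num_queries, hidden_dim]
# during training (denoising queries prepended), and attn_mask blocks
# attention between the denoising groups and the matching queries so the
# auxiliary denoising task cannot leak ground truth into the main queries.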
out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.dec_score_head, self.query_pos_head, attn_mask=attn_mask, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) if self.training: reference_points_unact = reference_points_unact.detach() enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = paddle.gather_nd(output_memory, topk_ind) if self.training: target = target.detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/rtdetr_transformerv2.py ================================================ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import functools import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from .rtdetr_transformer import TransformerDecoder from .utils import deformable_attention_core_func_v2, get_contrastive_denoising_training_group from ..heads.detr_head import MLP from ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob) from ..layers import MultiHeadAttention __all__ = ['RTDETRTransformerv2'] class MSDeformableAttention(nn.Layer): def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, sampling_method='default', offset_scale=0.5, lr_mult=0.1): """ Multi-Scale Deformable Attention Module """ super(MSDeformableAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.num_levels = num_levels if isinstance(num_points, list): assert len(num_points) == num_levels, ValueError num_points_list = num_points else: num_points_list = [num_points for _ in range(num_levels)] self.num_points_list = num_points_list self.total_points = num_heads * sum(num_points_list) num_points_scale = [1 / n for n in num_points_list for _ in range(n)] self.register_buffer('num_points_scale', paddle.to_tensor(num_points_scale, dtype=paddle.float32)) self.sampling_method = sampling_method self.offset_scale = offset_scale self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) self.ms_deformable_attn_core = functools.partial( deformable_attention_core_func_v2, num_points_list=self.num_points_list, sampling_method=self.sampling_method) self._reset_parameters() if self.sampling_method == 'discrete': for p in self.sampling_offsets.parameters(): p.stop_gradient = True def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) thetas = paddle.arange( self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) grid_init = grid_init.reshape([self.num_heads, 1, 2]).tile( [1, sum(self.num_points_list), 1]) scaling = paddle.concat( [paddle.arange(1, n + 1, dtype=paddle.float32) for n in self.num_points_list]).reshape([1, -1, 1]) grid_init *= scaling self.sampling_offsets.bias.set_value(grid_init.flatten()) # 
attention_weights constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) # proj xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, reference_points, value, value_spatial_shapes, value_mask=None): """ Args: query (Tensor): [batch_num, query_len, num_heads * head_dim] reference_points (Tensor): [batch_num, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [batch_num, value_len, num_heads * head_dim] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_mask (Tensor): [batch_num, value_len], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ batch_num, query_len = query.shape[:2] value_len = value.shape[1] value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([batch_num, value_len, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [batch_num, query_len, self.num_heads, sum(self.num_points_list), 2]) attention_weights = self.attention_weights(query).reshape( [batch_num, query_len, self.num_heads, sum(self.num_points_list)]) attention_weights = F.softmax(attention_weights, axis=-1) if reference_points.shape[-1] == 2: offset_normalizer = value_spatial_shapes.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ batch_num, query_len, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype) elif reference_points.shape[-1] == 4: offset = sampling_offsets * reference_points[:, :, None, :, 2:] num_points_scale = self.num_points_scale.astype(query.dtype).unsqueeze(-1) offset = offset * num_points_scale * self.offset_scale sampling_locations = reference_points[:, :, None, :, :2] + offset else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, sampling_method='default', weight_attr=None, bias_attr=None): super(TransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = MSDeformableAttention( d_model, n_head, n_levels, n_points, sampling_method=sampling_method, lr_mult=1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt @register class RTDETRTransformerv2(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, query_pos_head_inv_sig=False, eval_size=None, eval_idx=-1, eps=1e-2, cross_attn_sampling_method='default'): super(RTDETRTransformerv2, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
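# If num_levels exceeds the number of backbone feature maps, the missing
# strides are synthesized below by doubling the last one, and
# _build_input_proj_layer adds matching stride-2 conv projections.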
assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.eval_size = eval_size assert cross_attn_sampling_method in ['default', 'discrete'], NotImplementedError self.cross_attn_sampling_method = cross_attn_sampling_method # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points, sampling_method=cross_attn_sampling_method) self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in 
range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) # decoder out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.dec_score_head, self.query_pos_head, attn_mask=attn_mask, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) 
else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) if self.training: reference_points_unact = reference_points_unact.detach() enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = paddle.gather_nd(output_memory, topk_ind) if self.training: target = target.detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/rtdetr_transformerv3.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
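The v2 attention above can switch its sampling from bilinear grid_sample to a nearest-pixel lookup via cross_attn_sampling_method='discrete'. The following toy snippet sketches that rounding-and-gather step in isolation; it mirrors discrete_sample in ppdet/modeling/transformers/utils.py but is illustrative only, not repository code:

# Illustrative only: nearest-pixel ("discrete") sampling on toy tensors.
import paddle

x = paddle.arange(2 * 3 * 4 * 4, dtype='float32').reshape([2, 3, 4, 4])
grid = paddle.rand([2, 5, 1, 2])                    # [N, gH, gW, 2], locations in [0, 1]
N, C, H, W = x.shape
idx = (grid * paddle.to_tensor([[W, H]], dtype='float32') + 0.5)
idx = idx.astype('int64').flatten(1, 2)             # round to nearest pixel, [N, gH*gW, 2]
h = idx[:, :, 1].clip(0, H - 1)
w = idx[:, :, 0].clip(0, W - 1)
b = paddle.arange(N).unsqueeze(-1).tile([1, 5])     # [N, gH*gW] batch indices
out = x[b, :, h, w]                                 # gather: [N, gH*gW, C]
print(out.shape)                                    # [2, 5, 3]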
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from ..heads.detr_head import MLP from .deformable_transformer import MSDeformableAttention from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_sine_pos_embed, get_contrastive_denoising_training_group, inverse_sigmoid) __all__ = ['RTDETRTransformerv3'] class PPMSDeformableAttention(MSDeformableAttention): def forward(self, query, reference_points, value, value_spatial_shapes, value_level_start_index, value_mask=None): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = paddle.to_tensor(value_spatial_shapes) offset_normalizer = offset_normalizer.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) if not isinstance(query, paddle.Tensor): from ppdet.modeling.transformers.utils import deformable_attention_core_func output = deformable_attention_core_func( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) else: value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) value_level_start_index = paddle.to_tensor(value_level_start_index) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, weight_attr=None, bias_attr=None): super(TransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class TransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): super(TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, score_head, query_pos_head, attn_mask=None, memory_mask=None, query_pos_head_inv_sig=False): output = tgt dec_out_bboxes = [] 
dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): ref_points_input = ref_points_detach.unsqueeze(2) if not query_pos_head_inv_sig: query_pos_embed = query_pos_head(ref_points_detach) else: query_pos_embed = query_pos_head( inverse_sigmoid(ref_points_detach)) output = layer(output, ref_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points_detach)) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: dec_out_bboxes.append( F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points))) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) dec_out_bboxes.append(inter_ref_bbox) break ref_points = inter_ref_bbox ref_points_detach = inter_ref_bbox.detach( ) if self.training else inter_ref_bbox return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) @register class RTDETRTransformerv3(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size', 'o2m_branch', 'num_queries_o2m'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, query_pos_head_inv_sig=False, eval_size=None, eval_idx=-1, num_noises=0, num_noise_queries=[], num_noise_denoising=100, o2m_branch=False, num_queries_o2m=450, eps=1e-2): super(RTDETRTransformerv3, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
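# v3 keeps one query budget per decoding group: self.num_queries[0] is
# the standard one-to-one group, followed by num_noises auxiliary
# noise-query groups and, when o2m_branch is enabled, a final
# one-to-many group of num_queries_o2m queries.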
assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) assert len(num_noise_queries) == num_noises for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = [num_queries] self.eps = eps self.num_decoder_layers = num_decoder_layers self.eval_size = eval_size self.num_noises = num_noises self.num_noise_denoising = num_noise_denoising self.num_groups = 1 if num_noises > 0: self.num_queries.extend(num_noise_queries) self.num_groups += num_noises self.o2m_branch = o2m_branch self.num_queries_o2m = num_queries_o2m if o2m_branch: self.num_queries.append(num_queries_o2m) self.num_groups += 1 # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # encoder head self.enc_output = nn.LayerList([ nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) for _ in range(self.num_groups) ]) self.enc_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(self.num_groups) ]) self.enc_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(self.num_groups) ]) self.map_memory = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) ) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) for enc_score_head in self.enc_score_head: linear_init_(enc_score_head) constant_(enc_score_head.bias, bias_cls) for enc_bbox_head in self.enc_bbox_head: constant_(enc_bbox_head.layers[-1].weight) constant_(enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) for enc_output in self.enc_output: linear_init_(enc_output[0]) xavier_uniform_(enc_output[0].weight) linear_init_(self.map_memory[0]) xavier_uniform_(self.map_memory[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init 
encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) # prepare denoising training if self.training: denoising_classes, denoising_bbox_unacts, attn_masks, dn_metas = [], [], [], [] for g_id in range(self.num_noises + 1): if g_id == 0: num_denoising = self.num_denoising else: num_denoising = self.num_noise_denoising denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries[g_id], self.denoising_class_embed.weight, num_denoising, self.label_noise_ratio, self.box_noise_scale) denoising_classes.append(denoising_class) denoising_bbox_unacts.append(denoising_bbox_unact) attn_masks.append(attn_mask) dn_metas.append(dn_meta) else: denoising_classes, denoising_bbox_unacts, attn_masks, dn_metas = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, denoising_classes, denoising_bbox_unacts, is_teacher) # multi group noise attention if self.training: new_size = target.shape[1] new_attn_mask = paddle.ones([new_size, new_size]) < 0 begin, end = 0, 0 mask = None for g_id in range(self.num_groups): new_mask = paddle.rand([self.num_queries[g_id], self.num_queries[g_id]]) if self.o2m_branch and g_id == self.num_groups - 1: end = end + self.num_queries_o2m new_mask = new_mask >= 0.0 new_attn_mask[begin: end, begin: end] = new_mask else: end = end + attn_masks[g_id].shape[1] dn_size, q_size = 
dn_metas[g_id]['dn_num_split'] if g_id > 0: new_mask = new_mask > 0.1 else: new_mask = new_mask >= 0.0 attn_masks[g_id][dn_size: dn_size + q_size, dn_size: dn_size + q_size] = new_mask new_attn_mask[begin: end, begin: end] = attn_masks[g_id] begin = end attn_masks = new_attn_mask # decoder out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.dec_score_head, self.query_pos_head, attn_mask=attn_masks, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_metas) def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, spatial_shapes, denoising_classes=None, denoising_bbox_unacts=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) map_memory = self.map_memory(memory.detach()) targets, reference_points_unacts, enc_topk_bboxes, enc_topk_logits = [], [], [], [] for g_id in range(self.num_groups): output_memory = self.enc_output[g_id](memory) enc_outputs_class = self.enc_score_head[g_id](output_memory) enc_outputs_coord_unact = self.enc_bbox_head[g_id](output_memory) + anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries[g_id], axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries[g_id]]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. 
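# Each group g_id selects its own top num_queries[g_id] encoder proposals
# from a group-specific score/bbox head; denoising queries are then
# prepended for every group except the optional one-to-many branch.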
enc_topk_bbox = F.sigmoid(reference_points_unact) enc_topk_logit = paddle.gather_nd(enc_outputs_class, topk_ind) if denoising_bbox_unacts is not None and not (self.o2m_branch and g_id == self.num_groups - 1): reference_points_unact = paddle.concat( [denoising_bbox_unacts[g_id], reference_points_unact], 1) if self.training: reference_points_unact = reference_points_unact.detach() # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: if g_id == 0: target = paddle.gather_nd(output_memory, topk_ind) if self.training: target = target.detach() else: target = paddle.gather_nd(map_memory, topk_ind) if denoising_classes is not None and not (self.o2m_branch and g_id == self.num_groups - 1): target = paddle.concat([denoising_classes[g_id], target], 1) if not self.training: return target, reference_points_unact, enc_topk_bbox, enc_topk_logit targets.append(target) reference_points_unacts.append(reference_points_unact) enc_topk_bboxes.append(enc_topk_bbox) enc_topk_logits.append(enc_topk_logit) targets = paddle.concat(targets, 1) reference_points_unacts = paddle.concat(reference_points_unacts, 1) enc_topk_bboxes = paddle.concat(enc_topk_bboxes, 1) enc_topk_logits = paddle.concat(enc_topk_logits, 1) return targets, reference_points_unacts, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
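The helpers in this file underpin the logit-space box parameterization used by every decoder above. A minimal sketch of that round trip, inlining inverse_sigmoid from below so the snippet is self-contained (illustrative only):

# Illustrative only: logit-space box refinement as used by the decoders.
import paddle
import paddle.nn.functional as F

def inverse_sigmoid(x, eps=1e-5):
    # same definition as later in this file
    x = x.clip(min=0., max=1.)
    return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps))

ref = paddle.to_tensor([[0.25, 0.50, 0.20, 0.30]])     # cxcywh box in (0, 1)
delta = paddle.to_tensor([[0.10, -0.20, 0.00, 0.05]])  # a head's raw (logit) output
new_ref = F.sigmoid(delta + inverse_sigmoid(ref))      # refined box, still in (0, 1)
print(new_ref)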
from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import math import paddle import paddle.nn as nn import paddle.nn.functional as F from ..bbox_utils import bbox_overlaps __all__ = [ '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy', 'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid', 'deformable_attention_core_func', 'varifocal_loss_with_logits' ] def _get_clones(module, N): return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) def bbox_cxcywh_to_xyxy(x): cxcy, wh = paddle.split(x, 2, axis=-1) return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) def bbox_xyxy_to_cxcywh(x): x1, y1, x2, y2 = x.split(4, axis=-1) return paddle.concat([(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1) def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): prob = F.sigmoid(logit) ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none") p_t = prob * label + (1 - prob) * (1 - label) loss = ce_loss * ((1 - p_t)**gamma) if alpha >= 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) loss = alpha_t * loss return loss.mean(1).sum() / normalizer def inverse_sigmoid(x, eps=1e-5): x = x.clip(min=0., max=1.) return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps)) def deformable_attention_core_func(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights): """ Args: value (Tensor): [bs, value_length, n_head, c] value_spatial_shapes (Tensor|List): [n_levels, 2] value_level_start_index (Tensor|List): [n_levels] sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] Returns: output (Tensor): [bs, Length_{query}, C] """ bs, _, n_head, c = value.shape _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape split_shape = [h * w for h, w in value_spatial_shapes] value_list = value.split(split_shape, axis=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (h, w) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ value_l_ = value_list[level].flatten(2).transpose([0, 2, 1]).reshape( [bs * n_head, c, h, w]) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 sampling_grid_l_ = sampling_grids[:, :, :, level].transpose([0, 2, 1, 3, 4]).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( [bs * n_head, 1, Len_q, n_levels * n_points]) output = (paddle.stack(sampling_value_list, axis=-2).flatten(-2) * attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) return output.transpose([0, 2, 1]) def discrete_sample(x, grid): """ Args: x (Tensor): [N, C, H, W] grid (Tensor): [N, grid_H, grid_W, 2] Returns: output (Tensor): [N, C, grid_H, grid_W] """ N, C, H, W = x.shape _, grid_H, grid_W, _ = grid.shape spatial_shape = paddle.to_tensor([[W, H]], dtype=paddle.float32) index = (grid * spatial_shape + 0.5).astype(paddle.int64).flatten(1, 2) h_index = index[:, :, 1].clip(0, H - 1) w_index = index[:, :, 0].clip(0, W - 1) batch_index = paddle.arange(N).unsqueeze(-1).tile([1, grid_H * grid_W]) output = x[batch_index, :, h_index, w_index] 
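# (editor's note) the advanced indexing above gathers, per batch element, the C-channel feature vector at every rounded (h, w) sample location, yielding a [N, grid_H * grid_W, C] tensor that the transpose/reshape below turns back into [N, C, grid_H, grid_W].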
output = output.transpose([0, 2, 1]).reshape([N, C, grid_H, grid_W]) return output def deformable_attention_core_func_v2(value, value_spatial_shapes, sampling_locations, attention_weights, num_points_list, sampling_method='default'): """ Args: value (Tensor): [batch_num, value_len, num_heads, head_dim] value_spatial_shapes (Tensor|List): [n_levels, 2] sampling_locations (Tensor): [batch_num, query_len, num_heads, total_num_points, 2] attention_weights (Tensor): [batch_num, query_len, num_heads, total_num_points] num_points_list (List): The number of sampling point corresponding to each level sampling_method (str): default(grid_sample) or discrete(discrete_sample) Returns: output (Tensor): [batch_num, query_len, num_heads * head_dim] """ assert sampling_method in ['default', 'discrete'], NotImplementedError batch_num, _, num_heads, head_dim = value.shape query_len = sampling_locations.shape[1] num_levels = len(num_points_list) value = value.transpose([0, 2, 3, 1]).flatten(0, 1) split_shape = [h * w for h, w in value_spatial_shapes] value_list = value.split(split_shape, axis=-1) value_list = [ value.reshape([batch_num * num_heads, head_dim, h, w]) for value, (h, w) in zip(value_list, value_spatial_shapes) ] if sampling_method == 'default': sampling_grids = 2 * sampling_locations - 1 else: sampling_grids = sampling_locations sampling_grids = sampling_grids.transpose([0, 2, 1, 3, 4]).flatten(0, 1) sampling_grids_list = sampling_grids.split(num_points_list, axis=-2) sampling_value_list = [] for idx in range(num_levels): # value_list[idx]: [batch_num * num_heads, head_dim, h, w] # sampling_grids_list[idx]: [batch_num * num_heads, query_len, num_points, 2] # _sampling_value: [batch_num * num_heads, head_dim, query_len, num_points] if sampling_method == 'default': _sampling_value = F.grid_sample(value_list[idx], sampling_grids_list[idx], mode='bilinear', padding_mode='zeros', align_corners=False) else: _sampling_value = discrete_sample(value_list[idx], sampling_grids_list[idx]) sampling_value_list.append(_sampling_value) attn_weights = attention_weights.transpose([0, 2, 1, 3]) attn_weights = attn_weights.flatten(0, 1).unsqueeze(1) sampling_value = paddle.concat(sampling_value_list, axis=-1) # attn_weights: [batch_num * num_heads, 1, query_len, total_num_points] # sampling_value: [batch_num * num_heads, head_dim, query_len, total_num_points] # output: [batch_num * num_heads, head_dim, query_len] output = (sampling_value * attn_weights).sum(-1) output = output.reshape([batch_num, num_heads * head_dim, query_len]) return output.transpose([0, 2, 1]) def get_valid_ratio(mask): _, H, W = mask.shape valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W # [b, 2] return paddle.stack([valid_ratio_w, valid_ratio_h], -1) def get_denoising_training_group(targets, num_classes, num_queries, class_embed, num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0): if num_denoising <= 0: return None, None, None, None num_gts = [len(t) for t in targets["gt_class"]] max_gt_num = max(num_gts) if max_gt_num == 0: return None, None, None, None num_group = num_denoising // max_gt_num num_group = 1 if num_group == 0 else num_group # pad gt to max_num of a batch bs = len(targets["gt_class"]) input_query_class = paddle.full([bs, max_gt_num], num_classes, dtype='int32') input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) pad_gt_mask = paddle.zeros([bs, max_gt_num]) for i in range(bs): num_gt = num_gts[i] if num_gt > 0: input_query_class[i, :num_gt] = 
targets["gt_class"][i].squeeze(-1) input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] pad_gt_mask[i, :num_gt] = 1 input_query_class = input_query_class.tile([1, num_group]) input_query_bbox = input_query_bbox.tile([1, num_group, 1]) pad_gt_mask = pad_gt_mask.tile([1, num_group]) dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1] dn_positive_idx = paddle.split(dn_positive_idx, [n * num_group for n in num_gts]) # total denoising queries num_denoising = int(max_gt_num * num_group) if label_noise_ratio > 0: input_query_class = input_query_class.flatten() pad_gt_mask = pad_gt_mask.flatten() # half of bbox prob, cast mask from bool to float bacause dtype promotaion # between bool and float is not supported in static mode. mask = paddle.cast( paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5), paddle.float32) chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) # randomly put a new one here new_label = paddle.randint_like(chosen_idx, 0, num_classes, dtype=input_query_class.dtype) input_query_class.scatter_(chosen_idx, new_label) input_query_class.reshape_([bs, num_denoising]) pad_gt_mask.reshape_([bs, num_denoising]) if box_noise_scale > 0: diff = paddle.concat( [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], axis=-1) * box_noise_scale diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0) input_query_bbox += diff input_query_bbox = inverse_sigmoid(input_query_bbox) class_embed = paddle.concat( [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) input_query_class = paddle.gather(class_embed, input_query_class.flatten(), axis=0).reshape([bs, num_denoising, -1]) tgt_size = num_denoising + num_queries attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 # match query cannot see the reconstruction attn_mask[num_denoising:, :num_denoising] = True # reconstruct cannot see each other for i in range(num_group): if i == 0: attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1):num_denoising] = True if i == num_group - 1: attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * i] = True else: attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1):num_denoising] = True attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * i] = True attn_mask = ~attn_mask dn_meta = { "dn_positive_idx": dn_positive_idx, "dn_num_group": num_group, "dn_num_split": [num_denoising, num_queries] } return input_query_class, input_query_bbox, attn_mask, dn_meta def get_contrastive_denoising_training_group(targets, num_classes, num_queries, class_embed, num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0): if num_denoising <= 0: return None, None, None, None # listcomp is not well-supported in SOT mode for now. num_gts = [] for t in targets["gt_class"]: num_gts.append(len(t)) max_gt_num = max(num_gts) if max_gt_num == 0: return None, None, None, None num_group = num_denoising // max_gt_num num_group = 1 if num_group == 0 else num_group # pad gt to max_num of a batch bs = len(targets["gt_class"]) input_query_class = paddle.full([bs, max_gt_num], num_classes, dtype='int32') input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) pad_gt_mask = paddle.zeros([bs, max_gt_num]) for i in range(bs): num_gt = num_gts[i] if num_gt > 0: input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] pad_gt_mask[i, :num_gt] = 1 # each group has positive and negative queries. 
input_query_class = input_query_class.tile([1, 2 * num_group]) input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) # positive and negative mask negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) negative_gt_mask[:, max_gt_num:] = 1 negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) positive_gt_mask = 1 - negative_gt_mask # contrastive denoising training positive index positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] dn_positive_idx = paddle.split(dn_positive_idx, [n * num_group for n in num_gts]) # total denoising queries num_denoising = int(max_gt_num * 2 * num_group) if label_noise_ratio > 0: input_query_class = input_query_class.flatten() pad_gt_mask = pad_gt_mask.flatten() # half of bbox prob mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) chosen_idx = paddle.nonzero(mask.cast(pad_gt_mask.dtype) * pad_gt_mask).squeeze(-1) # randomly put a new one here new_label = paddle.randint_like(chosen_idx, 0, num_classes, dtype=input_query_class.dtype) input_query_class.scatter_(chosen_idx, new_label) input_query_class.reshape_([bs, num_denoising]) pad_gt_mask.reshape_([bs, num_denoising]) if box_noise_scale > 0: known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) diff = paddle.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 rand_part = paddle.rand(input_query_bbox.shape) rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( 1 - negative_gt_mask) rand_part *= rand_sign known_bbox += rand_part * diff known_bbox.clip_(min=0.0, max=1.0) input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox) input_query_bbox = inverse_sigmoid(input_query_bbox) class_embed = paddle.concat( [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) input_query_class = paddle.gather(class_embed, input_query_class.flatten(), axis=0).reshape([bs, num_denoising, -1]) tgt_size = num_denoising + num_queries attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 # match query cannot see the reconstruction attn_mask[num_denoising:, :num_denoising] = True # reconstruct cannot see each other for i in range(num_group): if i == 0: attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1):num_denoising] = True if i == num_group - 1: attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True else: attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1):num_denoising] = True attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True attn_mask = ~attn_mask dn_meta = { "dn_positive_idx": dn_positive_idx, "dn_num_group": num_group, "dn_num_split": [num_denoising, num_queries] } return input_query_class, input_query_bbox, attn_mask, dn_meta def get_sine_pos_embed(pos_tensor, num_pos_feats=128, temperature=10000, exchange_xy=True): """generate sine position embedding from a position tensor Args: pos_tensor (Tensor): Shape as `(None, n)`. num_pos_feats (int): projected shape for each float in the tensor. Default: 128 temperature (int): The temperature used for scaling the position embedding. Default: 10000. exchange_xy (bool, optional): exchange pos x and pos y. \ For example, input tensor is `[x, y]`, the results will # noqa be `[pos(y), pos(x)]`. Defaults: True. Returns: Tensor: Returned position embedding # noqa with shape `(None, n * num_pos_feats)`. """ scale = 2. * math.pi dim_t = 2. 
* paddle.floor_divide(paddle.arange(num_pos_feats), paddle.to_tensor(2)) dim_t = scale / temperature**(dim_t / num_pos_feats) def sine_func(x): x *= dim_t return paddle.stack((x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2) pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] if exchange_xy: pos_res[0], pos_res[1] = pos_res[1], pos_res[0] pos_res = paddle.concat(pos_res, axis=2) return pos_res def mask_to_box_coordinate(mask, normalize=False, format="xyxy", dtype="float32"): """ Compute the bounding boxes around the provided mask. Args: mask (Tensor:bool): [b, c, h, w] Returns: bbox (Tensor): [b, c, 4] """ assert mask.ndim == 4 assert format in ["xyxy", "xywh"] h, w = mask.shape[-2:] y, x = paddle.meshgrid(paddle.arange(end=h, dtype=dtype), paddle.arange(end=w, dtype=dtype)) x_mask = x * mask.astype(x.dtype) x_max = x_mask.flatten(-2).max(-1) + 1 x_min = paddle.where(mask.astype(bool), x_mask, paddle.to_tensor(1e8)).flatten(-2).min(-1) y_mask = y * mask.astype(y.dtype) y_max = y_mask.flatten(-2).max(-1) + 1 y_min = paddle.where(mask.astype(bool), y_mask, paddle.to_tensor(1e8)).flatten(-2).min(-1) out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1) mask = mask.any(axis=[2, 3]).unsqueeze(2) out_bbox = out_bbox * mask.astype(out_bbox.dtype) if normalize: out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype) return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox) def varifocal_loss_with_logits(pred_logits, gt_score, label, normalizer=1.0, alpha=0.75, gamma=2.0): pred_score = F.sigmoid(pred_logits) weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy_with_logits(pred_logits, gt_score, weight=weight, reduction='none') return loss.mean(1).sum() / normalizer ================================================ FILE: ppdet/optimizer/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import optimizer from . import ema from .optimizer import * from .ema import * ================================================ FILE: ppdet/optimizer/adamw.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
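# (Editor's sketch, not part of the original file.) layerwise_lr_decay below
# scales each transformer layer's learning rate as
#     lr_m = base_lr * decay_rate ** (n_layers - m),
# so shallow layers take smaller steps than deep ones, and embedding-like
# parameters (cls_token / patch_embed / pos_embed) use the smallest ratio,
# decay_rate ** (n_layers + 1). For decay_rate = 0.65 and n_layers = 12:
#
#     ratios = [0.65 ** (12 - m) for m in range(12)]
#     # ratios[0] ~= 0.0057 for the first block, ratios[11] = 0.65 for the last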
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from paddle.optimizer import AdamW from functools import partial import re # major versions above 2 (and dev builds, which report major == 0) also count as later than 2.4 IS_PADDLE_LATER_2_4 = int(paddle.version.major) > 2 or ( int(paddle.version.major) == 2 and int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0 def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): """ Args: decay_rate (float): The layer-wise decay ratio. name_dict (dict): The keys of name_dict are the dynamic names of the model, while the values are the static names. Use model.named_parameters() to get name_dict. n_layers (int): Total number of layers in the transformer encoder. """ ratio = 1.0 static_name = name_dict[param.name] if 'blocks.' in static_name or 'layers.' in static_name: idx_1 = static_name.find('blocks.') idx_2 = static_name.find('layers.') assert any([x >= 0 for x in [idx_1, idx_2]]), '' idx = idx_1 if idx_1 >= 0 else idx_2 # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0] layer = int(static_name[idx:].split('.')[1]) ratio = decay_rate**(n_layers - layer) elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name: ratio = decay_rate**(n_layers + 1) if IS_PADDLE_LATER_2_4: return ratio else: param.optimize_attr['learning_rate'] *= ratio class AdamWDL(AdamW): r""" The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. Generally it's used for transformer models. We use "layerwise_lr_decay" as the default dynamic lr setting method of AdamWDL. “Layer-wise decay” means exponentially decaying the learning rates of individual layers in a top-down manner. For example, suppose the 24-th layer uses a learning rate l, and the layer-wise decay rate is α, then the learning rate of layer m is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. .. math:: & t = t + 1 & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) Args: learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. It can be a float value or a LRScheduler. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 1e-08. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. This parameter is required in dygraph mode. The default value is None in static mode, at this time all parameters will be updated. weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. apply_decay_param_fun (function|None, optional): If it is not None, only tensors that make apply_decay_param_fun(Tensor.name)==True will be updated. It only works when we want to specify tensors. Default: None.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three clipping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. The accumulators are updated at every step. Every element of the two moving-averages is updated in both dense mode and sparse mode. If the size of parameter is very large, then the update may be very slow. The lazy mode only updates the elements that have gradients in the current mini-batch, so it will be much faster. But this mode has different semantics from the original Adam algorithm and may lead to different results. The default value is False. multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False. layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0. n_layers (int, optional): The total number of encoder layers. Defaults to 12. set_param_lr_fun (function|None, optional): If it's not None, set_param_lr_fun() will set the parameter learning rate before it executes the Adam operator. Defaults to :ref:`layerwise_lr_decay`. name_dict (dict, optional): The keys of name_dict are the dynamic names of the model, while the values are the static names. Use model.named_parameters() to get name_dict. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. Examples: .. code-block:: python import paddle from paddlenlp.ops.optimizer import AdamWDL def simple_lr_setting(decay_rate, name_dict, n_layers, param): ratio = 1.0 static_name = name_dict[param.name] if "weight" in static_name: ratio = decay_rate**0.5 param.optimize_attr["learning_rate"] *= ratio linear = paddle.nn.Linear(10, 10) name_dict = dict() for n, p in linear.named_parameters(): name_dict[p.name] = n inp = paddle.rand([10,10], dtype="float32") out = linear(inp) loss = paddle.mean(out) adamwdl = AdamWDL( learning_rate=1e-4, parameters=linear.parameters(), set_param_lr_fun=simple_lr_setting, layerwise_decay=0.8, name_dict=name_dict) loss.backward() adamwdl.step() adamwdl.clear_grad() """ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, parameters=None, weight_decay=0.01, apply_decay_param_fun=None, grad_clip=None, lazy_mode=False, multi_precision=False, layerwise_decay=1.0, n_layers=12, set_param_lr_func=None, name_dict=None, name=None): if not isinstance(layerwise_decay, float): raise TypeError("coeff should be float or Tensor.") self.layerwise_decay = layerwise_decay self.n_layers = n_layers self.set_param_lr_func = partial( set_param_lr_func, layerwise_decay, name_dict, n_layers) if set_param_lr_func is not None else set_param_lr_func if IS_PADDLE_LATER_2_4: super(AdamWDL, self).__init__( learning_rate=learning_rate, parameters=parameters, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_clip=grad_clip, name=name, apply_decay_param_fun=apply_decay_param_fun, weight_decay=weight_decay, lazy_mode=lazy_mode, multi_precision=multi_precision, lr_ratio=self.set_param_lr_func) else: super(AdamWDL, self).__init__( learning_rate=learning_rate, parameters=parameters, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_clip=grad_clip, name=name, apply_decay_param_fun=apply_decay_param_fun,
weight_decay=weight_decay, lazy_mode=lazy_mode, multi_precision=multi_precision) def _append_optimize_op(self, block, param_and_grad): if self.set_param_lr_func is None: return super(AdamWDL, self)._append_optimize_op(block, param_and_grad) self._append_decoupled_weight_decay(block, param_and_grad) prev_lr = param_and_grad[0].optimize_attr["learning_rate"] self.set_param_lr_func(param_and_grad[0]) # execute Adam op res = super(AdamW, self)._append_optimize_op(block, param_and_grad) param_and_grad[0].optimize_attr["learning_rate"] = prev_lr return res if not IS_PADDLE_LATER_2_4: AdamWDL._append_optimize_op = _append_optimize_op def build_adamwdl(model, lr=1e-4, weight_decay=0.05, betas=(0.9, 0.999), layer_decay=0.65, num_layers=None, filter_bias_and_bn=True, skip_decay_names=None, set_param_lr_func='layerwise_lr_decay'): if skip_decay_names and filter_bias_and_bn: decay_dict = { param.name: not (len(param.shape) == 1 or name.endswith('.bias') or any([_n in name for _n in skip_decay_names])) for name, param in model.named_parameters() } parameters = [p for p in model.parameters()] else: parameters = model.parameters() # no skip list given: no per-parameter weight-decay filtering decay_dict = None opt_args = dict( parameters=parameters, learning_rate=lr, weight_decay=weight_decay) if decay_dict is not None: opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n] if isinstance(set_param_lr_func, str): set_param_lr_func = eval(set_param_lr_func) opt_args['set_param_lr_func'] = set_param_lr_func opt_args['beta1'] = betas[0] opt_args['beta2'] = betas[1] opt_args['layerwise_decay'] = layer_decay name_dict = {p.name: n for n, p in model.named_parameters()} opt_args['name_dict'] = name_dict opt_args['n_layers'] = num_layers optimizer = AdamWDL(**opt_args) return optimizer ================================================ FILE: ppdet/optimizer/ema.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import weakref from copy import deepcopy from .utils import get_bn_running_state_names __all__ = ['ModelEMA', 'SimpleModelEMA'] class ModelEMA(object): """ Exponential Weighted Average for Deep Neural Networks Args: model (nn.Layer): The detector model. decay (float): The decay used for updating the ema parameters. EMA parameters are updated with the formula: `ema_param = decay * ema_param + (1 - decay) * cur_param`. Default is 0.9998. ema_decay_type (str): type in ['threshold', 'normal', 'exponential'], 'threshold' as default. cycle_epoch (int): The epoch interval to reset ema_param and step. Default is -1, which means no reset. It adds a regularizing effect to EMA; the value is set empirically and is effective when the total number of training epochs is large. ema_black_list (set|list|tuple, optional): The custom EMA black_list. Blacklist of weight names that will not participate in EMA calculation. Default: None.
""" def __init__(self, model, decay=0.9998, ema_decay_type='threshold', cycle_epoch=-1, ema_black_list=None, ema_filter_no_grad=False): self.step = 0 self.epoch = 0 self.decay = decay self.ema_decay_type = ema_decay_type self.cycle_epoch = cycle_epoch self.ema_black_list = self._match_ema_black_list( model.state_dict().keys(), ema_black_list) bn_states_names = get_bn_running_state_names(model) if ema_filter_no_grad: for n, p in model.named_parameters(): if p.stop_gradient and n not in bn_states_names: self.ema_black_list.add(n) self.state_dict = dict() for k, v in model.state_dict().items(): if k in self.ema_black_list: self.state_dict[k] = v else: self.state_dict[k] = paddle.zeros_like(v, dtype='float32') self._model_state = { k: weakref.ref(p) for k, p in model.state_dict().items() } def reset(self): self.step = 0 self.epoch = 0 for k, v in self.state_dict.items(): if k in self.ema_black_list: self.state_dict[k] = v else: self.state_dict[k] = paddle.zeros_like(v) def resume(self, state_dict, step=0): for k, v in state_dict.items(): if k in self.state_dict: if self.state_dict[k].dtype == v.dtype: self.state_dict[k] = v else: self.state_dict[k] = v.astype(self.state_dict[k].dtype) self.step = step def update(self, model=None): if self.ema_decay_type == 'threshold': decay = min(self.decay, (1 + self.step) / (10 + self.step)) elif self.ema_decay_type == 'exponential': decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000)) else: decay = self.decay self._decay = decay if model is not None: model_dict = model.state_dict() else: model_dict = {k: p() for k, p in self._model_state.items()} assert all( [v is not None for _, v in model_dict.items()]), 'python gc.' for k, v in self.state_dict.items(): if k not in self.ema_black_list: v = decay * v + (1 - decay) * model_dict[k].astype('float32') v.stop_gradient = True self.state_dict[k] = v self.step += 1 def apply(self): if self.step == 0: return self.state_dict state_dict = dict() model_dict = {k: p() for k, p in self._model_state.items()} for k, v in self.state_dict.items(): if k in self.ema_black_list: v.stop_gradient = True state_dict[k] = v else: if self.ema_decay_type != 'exponential': v = v / (1 - self._decay**self.step) v = v.astype(model_dict[k].dtype) v.stop_gradient = True state_dict[k] = v self.epoch += 1 if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: self.reset() return state_dict def _match_ema_black_list(self, weight_name, ema_black_list=None): out_list = set() if ema_black_list: for name in weight_name: for key in ema_black_list: if key in name: out_list.add(name) return out_list class SimpleModelEMA(object): """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models Keep a moving average of everything in the model state_dict (parameters and buffers). This is intended to allow functionality like https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage A smoothed version of the weights is necessary for some training schemes to perform well. This class is sensitive where it is initialized in the sequence of model init, GPU assignment and distributed training wrappers. """ def __init__(self, model=None, decay=0.9996): """ Args: model (nn.Module): model to apply EMA. decay (float): ema decay reate. 
""" self.model = deepcopy(model) self.decay = decay def update(self, model, decay=None): if decay is None: decay = self.decay with paddle.no_grad(): state = {} msd = model.state_dict() for k, v in self.model.state_dict().items(): if paddle.is_floating_point(v): v *= decay v += (1.0 - decay) * msd[k].detach() state[k] = v self.model.set_state_dict(state) def resume(self, state_dict, step=0): state = {} msd = state_dict for k, v in self.model.state_dict().items(): if paddle.is_floating_point(v): v = msd[k].detach() state[k] = v self.model.set_state_dict(state) self.step = step ================================================ FILE: ppdet/optimizer/optimizer.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import re import sys import math import paddle import paddle.nn as nn import paddle.optimizer as optimizer import paddle.regularizer as regularizer from ppdet.core.workspace import register, serializable import copy from .adamw import AdamWDL, build_adamwdl __all__ = ['LearningRate', 'OptimizerBuilder'] from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @serializable class CosineDecay(object): """ Cosine learning rate decay Args: max_epochs (int): max epochs for the training process. if you commbine cosine decay with warmup, it is recommended that the max_iters is much larger than the warmup iter use_warmup (bool): whether to use warmup. Default: True. min_lr_ratio (float): minimum learning rate ratio. Default: 0. last_plateau_epochs (int): use minimum learning rate in the last few epochs. Default: 0. 
""" def __init__(self, max_epochs=1000, use_warmup=True, min_lr_ratio=0., last_plateau_epochs=0): self.max_epochs = max_epochs self.use_warmup = use_warmup self.min_lr_ratio = min_lr_ratio self.last_plateau_epochs = last_plateau_epochs def __call__(self, base_lr=None, boundary=None, value=None, step_per_epoch=None): assert base_lr is not None, "either base LR or values should be provided" max_iters = self.max_epochs * int(step_per_epoch) last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch) min_lr = base_lr * self.min_lr_ratio if boundary is not None and value is not None and self.use_warmup: # use warmup warmup_iters = len(boundary) for i in range(int(boundary[-1]), max_iters): boundary.append(i) if i < max_iters - last_plateau_iters: decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( (i - warmup_iters) * math.pi / (max_iters - warmup_iters - last_plateau_iters)) + 1) value.append(decayed_lr) else: value.append(min_lr) return optimizer.lr.PiecewiseDecay(boundary, value) elif last_plateau_iters > 0: # not use warmup, but set `last_plateau_epochs` > 0 boundary = [] value = [] for i in range(max_iters): if i < max_iters - last_plateau_iters: decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( i * math.pi / (max_iters - last_plateau_iters)) + 1) value.append(decayed_lr) else: value.append(min_lr) if i > 0: boundary.append(i) return optimizer.lr.PiecewiseDecay(boundary, value) return optimizer.lr.CosineAnnealingDecay( base_lr, T_max=max_iters, eta_min=min_lr) @serializable class PiecewiseDecay(object): """ Multi step learning rate decay Args: gamma (float | list): decay factor milestones (list): steps at which to decay learning rate """ def __init__(self, gamma=[0.1, 0.01], milestones=[8, 11], values=None, use_warmup=True): super(PiecewiseDecay, self).__init__() if type(gamma) is not list: self.gamma = [] for i in range(len(milestones)): self.gamma.append(gamma / 10**i) else: self.gamma = gamma self.milestones = milestones self.values = values self.use_warmup = use_warmup def __call__(self, base_lr=None, boundary=None, value=None, step_per_epoch=None): if boundary is not None and self.use_warmup: boundary.extend([int(step_per_epoch) * i for i in self.milestones]) else: # do not use LinearWarmup boundary = [int(step_per_epoch) * i for i in self.milestones] value = [base_lr] # during step[0, boundary[0]] is base_lr # self.values is setted directly in config if self.values is not None: assert len(self.milestones) + 1 == len(self.values) return optimizer.lr.PiecewiseDecay(boundary, self.values) # value is computed by self.gamma value = value if value is not None else [base_lr] for i in self.gamma: value.append(base_lr * i) return optimizer.lr.PiecewiseDecay(boundary, value) @serializable class LinearWarmup(object): """ Warm up learning rate linearly Args: steps (int): warm up steps start_factor (float): initial learning rate factor epochs (int|None): use epochs as warm up steps, the priority of `epochs` is higher than `steps`. Default: None. """ def __init__(self, steps=500, start_factor=1. 
/ 3, epochs=None, epochs_first=True): super(LinearWarmup, self).__init__() self.steps = steps self.start_factor = start_factor self.epochs = epochs self.epochs_first = epochs_first def __call__(self, base_lr, step_per_epoch): boundary = [] value = [] if self.epochs_first and self.epochs is not None: warmup_steps = self.epochs * step_per_epoch else: warmup_steps = self.steps warmup_steps = max(warmup_steps, 1) for i in range(warmup_steps + 1): if warmup_steps > 0: alpha = i / warmup_steps factor = self.start_factor * (1 - alpha) + alpha lr = base_lr * factor value.append(lr) if i > 0: boundary.append(i) return boundary, value @serializable class ExpWarmup(object): """ Warm up learning rate in exponential mode Args: steps (int): warm up steps. epochs (int|None): use epochs as warm up steps, the priority of `epochs` is higher than `steps`. Default: None. power (int): Exponential coefficient. Default: 2. """ def __init__(self, steps=1000, epochs=None, power=2): super(ExpWarmup, self).__init__() self.steps = steps self.epochs = epochs self.power = power def __call__(self, base_lr, step_per_epoch): boundary = [] value = [] warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps warmup_steps = max(warmup_steps, 1) for i in range(warmup_steps + 1): factor = (i / float(warmup_steps))**self.power value.append(base_lr * factor) if i > 0: boundary.append(i) return boundary, value @register class LearningRate(object): """ Learning Rate configuration Args: base_lr (float): base learning rate schedulers (list): learning rate schedulers """ __category__ = 'optim' def __init__(self, base_lr=0.01, schedulers=[PiecewiseDecay(), LinearWarmup()]): super(LearningRate, self).__init__() self.base_lr = base_lr self.schedulers = [] schedulers = copy.deepcopy(schedulers) for sched in schedulers: if isinstance(sched, dict): # support dict sched instantiate module = sys.modules[__name__] type = sched.pop("name") scheduler = getattr(module, type)(**sched) self.schedulers.append(scheduler) else: self.schedulers.append(sched) def __call__(self, step_per_epoch): assert len(self.schedulers) >= 1 if not self.schedulers[0].use_warmup: return self.schedulers[0](base_lr=self.base_lr, step_per_epoch=step_per_epoch) # TODO: split warmup & decay # warmup boundary, value = self.schedulers[1](self.base_lr, step_per_epoch) # decay decay_lr = self.schedulers[0](self.base_lr, boundary, value, step_per_epoch) return decay_lr @register class OptimizerBuilder(): """ Build optimizer handles Args: regularizer (object): an `Regularizer` instance optimizer (object): an `Optimizer` instance """ __category__ = 'optim' def __init__(self, clip_grad_by_norm=None, clip_grad_by_value=None, regularizer={'type': 'L2', 'factor': .0001}, optimizer={'type': 'Momentum', 'momentum': .9}): self.clip_grad_by_norm = clip_grad_by_norm self.clip_grad_by_value = clip_grad_by_value self.regularizer = regularizer self.optimizer = optimizer def __call__(self, learning_rate, model=None): if self.clip_grad_by_norm is not None: grad_clip = nn.ClipGradByGlobalNorm( clip_norm=self.clip_grad_by_norm) elif self.clip_grad_by_value is not None: var = abs(self.clip_grad_by_value) grad_clip = nn.ClipGradByValue(min=-var, max=var) else: grad_clip = None if self.regularizer and self.regularizer != 'None': reg_type = self.regularizer['type'] + 'Decay' reg_factor = self.regularizer['factor'] regularization = getattr(regularizer, reg_type)(reg_factor) else: regularization = None optim_args = self.optimizer.copy() optim_type = optim_args['type'] del 
optim_args['type'] if optim_type == 'AdamWDL': return build_adamwdl(model, lr=learning_rate, **optim_args) if optim_type != 'AdamW': optim_args['weight_decay'] = regularization op = getattr(optimizer, optim_type) if 'param_groups' in optim_args: assert isinstance(optim_args['param_groups'], list), '' param_groups = optim_args.pop('param_groups') params, visited = [], [] for group in param_groups: assert isinstance(group, dict) and 'params' in group and isinstance( group['params'], list), '' _params = {} for n, p in model.named_parameters(): if not p.trainable: continue for k in group['params']: if re.search(k, n): _params.update({n: p}) break _group = group.copy() _group.update({'params': list(_params.values())}) params.append(_group) visited.extend(list(_params.keys())) ext_params = [ p for n, p in model.named_parameters() if n not in visited and p.trainable is True ] if len(ext_params) < len(model.parameters()): params.append({'params': ext_params}) elif len(ext_params) > len(model.parameters()): raise RuntimeError else: _params = model.parameters() params = [param for param in _params if param.trainable is True] return op(learning_rate=learning_rate, parameters=params, grad_clip=grad_clip, **optim_args) ================================================ FILE: ppdet/optimizer/utils.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from typing import List def get_bn_running_state_names(model: nn.Layer) -> List[str]: """Get all bn state full names including running mean and variance """ names = [] for n, m in model.named_sublayers(): if isinstance(m, (nn.BatchNorm2D, nn.SyncBatchNorm)): assert hasattr(m, '_mean'), f'assert {m} has _mean' assert hasattr(m, '_variance'), f'assert {m} has _variance' running_mean = f'{n}._mean' running_var = f'{n}._variance' names.extend([running_mean, running_var]) return names ================================================ FILE: ppdet/slim/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import distill_loss from . import distill_model from . import ofa from . import prune from . import quant from . 
import unstructured_prune from .distill_loss import * from .distill_model import * from .ofa import * from .prune import * from .quant import * from .unstructured_prune import * import yaml from ppdet.core.workspace import load_config from ppdet.utils.checkpoint import load_pretrain_weight def build_slim_model(cfg, slim_cfg, mode='train'): with open(slim_cfg) as f: slim_load_cfg = yaml.load(f, Loader=yaml.Loader) if mode != 'train' and slim_load_cfg['slim'] == 'Distill': return cfg if slim_load_cfg['slim'] == 'Distill': if "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "FGD": model = FGDDistillModel(cfg, slim_cfg) elif "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "LD": model = LDDistillModel(cfg, slim_cfg) elif "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "CWD": model = CWDDistillModel(cfg, slim_cfg) elif "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "PPYOLOEDistill": model = PPYOLOEDistillModel(cfg, slim_cfg) else: # common distillation model model = DistillModel(cfg, slim_cfg) cfg['model'] = model cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'OFA': load_config(slim_cfg) model = create(cfg.architecture) load_pretrain_weight(model, cfg.weights) slim = create(cfg.slim) cfg['slim'] = slim cfg['model'] = slim(model, model.state_dict()) cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'DistillPrune': if mode == 'train': model = DistillModel(cfg, slim_cfg) pruner = create(cfg.pruner) pruner(model.student_model) else: model = create(cfg.architecture) weights = cfg.weights load_config(slim_cfg) pruner = create(cfg.pruner) model = pruner(model) load_pretrain_weight(model, weights) cfg['model'] = model cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'PTQ': model = create(cfg.architecture) load_config(slim_cfg) load_pretrain_weight(model, cfg.weights) slim = create(cfg.slim) cfg['slim_type'] = cfg.slim cfg['slim'] = slim cfg['model'] = slim(model) elif slim_load_cfg['slim'] == 'UnstructuredPruner': load_config(slim_cfg) slim = create(cfg.slim) cfg['slim_type'] = cfg.slim cfg['slim'] = slim cfg['unstructured_prune'] = True else: load_config(slim_cfg) model = create(cfg.architecture) if mode == 'train': load_pretrain_weight(model, cfg.pretrain_weights) slim = create(cfg.slim) cfg['slim_type'] = cfg.slim # TODO: fix quant export model in framework. if mode == 'test' and 'QAT' in slim_load_cfg['slim']: slim.quant_config['activation_preprocess_type'] = None cfg['model'] = slim(model) cfg['slim'] = slim if mode != 'train': load_pretrain_weight(cfg['model'], cfg.weights) return cfg ================================================ FILE: ppdet/slim/distill_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
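# (Editor's sketch, not part of the original file.) The KL-divergence
# distillation implemented by KnowledgeDistillationKLDivLoss below softens
# teacher and student logits with a temperature T and rescales by T**2 so the
# gradient magnitude stays comparable across temperatures:
#
#     import paddle.nn.functional as F
#
#     def kd_kl(student_logits, teacher_logits, T=10.):
#         target = F.softmax(teacher_logits / T, axis=1).detach()
#         log_pred = F.log_softmax(student_logits / T, axis=1)
#         return F.kl_div(log_pred, target, reduction='none').mean(1) * T * T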
from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from ppdet.core.workspace import register from ppdet.modeling import ops from ppdet.modeling.losses.iou_loss import GIoULoss from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'DistillYOLOv3Loss', 'KnowledgeDistillationKLDivLoss', 'DistillPPYOLOELoss', 'FGDFeatureLoss', 'CWDFeatureLoss', 'PKDFeatureLoss', 'MGDFeatureLoss', ] def parameter_init(mode="kaiming", value=0.): if mode == "kaiming": weight_attr = paddle.nn.initializer.KaimingUniform() elif mode == "constant": weight_attr = paddle.nn.initializer.Constant(value=value) else: weight_attr = paddle.nn.initializer.KaimingUniform() weight_init = ParamAttr(initializer=weight_attr) return weight_init def feature_norm(feat): # Normalize the feature maps to have zero mean and unit variances. assert len(feat.shape) == 4 N, C, H, W = feat.shape feat = feat.transpose([1, 0, 2, 3]).reshape([C, -1]) mean = feat.mean(axis=-1, keepdim=True) std = feat.std(axis=-1, keepdim=True) feat = (feat - mean) / (std + 1e-6) return feat.reshape([C, N, H, W]).transpose([1, 0, 2, 3]) @register class DistillYOLOv3Loss(nn.Layer): def __init__(self, weight=1000): super(DistillYOLOv3Loss, self).__init__() self.loss_weight = weight def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj): loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx)) loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty)) loss_w = paddle.abs(sw - tw) loss_h = paddle.abs(sh - th) loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h]) weighted_loss = paddle.mean(loss * F.sigmoid(tobj)) return weighted_loss def obj_weighted_cls(self, scls, tcls, tobj): loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls)) weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj))) return weighted_loss def obj_loss(self, sobj, tobj): obj_mask = paddle.cast(tobj > 0., dtype="float32") obj_mask.stop_gradient = True loss = paddle.mean( ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask)) return loss def forward(self, teacher_model, student_model): teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs student_distill_pairs = student_model.yolo_head.loss.distill_pairs distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], [] for s_pair, t_pair in zip(student_distill_pairs, teacher_distill_pairs): distill_reg_loss.append( self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[ 3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4])) distill_cls_loss.append( self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4])) distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4])) distill_reg_loss = paddle.add_n(distill_reg_loss) distill_cls_loss = paddle.add_n(distill_cls_loss) distill_obj_loss = paddle.add_n(distill_obj_loss) loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss ) * self.loss_weight return loss @register class KnowledgeDistillationKLDivLoss(nn.Layer): """Loss function for knowledge distilling using KL divergence. Args: reduction (str): Options are `'none'`, `'mean'` and `'sum'`. loss_weight (float): Loss weight of current loss. T (int): Temperature for distillation. 
""" def __init__(self, reduction='mean', loss_weight=1.0, T=10): super(KnowledgeDistillationKLDivLoss, self).__init__() assert reduction in ('none', 'mean', 'sum') assert T >= 1 self.reduction = reduction self.loss_weight = loss_weight self.T = T def knowledge_distillation_kl_div_loss(self, pred, soft_label, T, detach_target=True): r"""Loss function for knowledge distilling using KL divergence. Args: pred (Tensor): Predicted logits with shape (N, n + 1). soft_label (Tensor): Target logits with shape (N, N + 1). T (int): Temperature for distillation. detach_target (bool): Remove soft_label from automatic differentiation """ assert pred.shape == soft_label.shape target = F.softmax(soft_label / T, axis=1) if detach_target: target = target.detach() kd_loss = F.kl_div( F.log_softmax( pred / T, axis=1), target, reduction='none').mean(1) * (T * T) return kd_loss def forward(self, pred, soft_label, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (Tensor): Predicted logits with shape (N, n + 1). soft_label (Tensor): Target logits with shape (N, N + 1). weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = (reduction_override if reduction_override else self.reduction) loss_kd_out = self.knowledge_distillation_kl_div_loss( pred, soft_label, T=self.T) if weight is not None: loss_kd_out = weight * loss_kd_out if avg_factor is None: if reduction == 'none': loss = loss_kd_out elif reduction == 'mean': loss = loss_kd_out.mean() elif reduction == 'sum': loss = loss_kd_out.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': loss = loss_kd_out.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') loss_kd = self.loss_weight * loss return loss_kd @register class DistillPPYOLOELoss(nn.Layer): def __init__( self, loss_weight={'logits': 4.0, 'feat': 1.0}, logits_distill=True, logits_loss_weight={'class': 1.0, 'iou': 2.5, 'dfl': 0.5}, logits_ld_distill=False, logits_ld_params={'weight': 20000, 'T': 10}, feat_distill=True, feat_distiller='fgd', feat_distill_place='neck_feats', teacher_width_mult=1.0, # L student_width_mult=0.75, # M feat_out_channels=[768, 384, 192]): super(DistillPPYOLOELoss, self).__init__() self.loss_weight_logits = loss_weight['logits'] self.loss_weight_feat = loss_weight['feat'] self.logits_distill = logits_distill self.logits_ld_distill = logits_ld_distill self.feat_distill = feat_distill if logits_distill and self.loss_weight_logits > 0: self.bbox_loss_weight = logits_loss_weight['iou'] self.dfl_loss_weight = logits_loss_weight['dfl'] self.qfl_loss_weight = logits_loss_weight['class'] self.loss_bbox = GIoULoss() if logits_ld_distill: self.loss_kd = KnowledgeDistillationKLDivLoss( loss_weight=logits_ld_params['weight'], T=logits_ld_params['T']) if feat_distill and self.loss_weight_feat > 0: assert feat_distiller in ['cwd', 'fgd', 'pkd', 'mgd', 'mimic'] assert feat_distill_place in ['backbone_feats', 'neck_feats'] self.feat_distill_place = feat_distill_place self.t_channel_list = [ int(c * teacher_width_mult) for c in feat_out_channels ] 
self.s_channel_list = [ int(c * student_width_mult) for c in feat_out_channels ] self.distill_feat_loss_modules = [] for i in range(len(feat_out_channels)): if feat_distiller == 'cwd': feat_loss_module = CWDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True) elif feat_distiller == 'fgd': feat_loss_module = FGDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True, alpha_fgd=0.00001, beta_fgd=0.000005, gamma_fgd=0.00001, lambda_fgd=0.00000005) elif feat_distiller == 'pkd': feat_loss_module = PKDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True, resize_stu=True) elif feat_distiller == 'mgd': feat_loss_module = MGDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True, loss_func='ssim') elif feat_distiller == 'mimic': feat_loss_module = MimicFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True) else: raise ValueError self.distill_feat_loss_modules.append(feat_loss_module) def quality_focal_loss(self, pred_logits, soft_target_logits, beta=2.0, use_sigmoid=False, num_total_pos=None): if use_sigmoid: func = F.binary_cross_entropy_with_logits soft_target = F.sigmoid(soft_target_logits) pred_sigmoid = F.sigmoid(pred_logits) preds = pred_logits else: func = F.binary_cross_entropy soft_target = soft_target_logits pred_sigmoid = pred_logits preds = pred_sigmoid scale_factor = pred_sigmoid - soft_target loss = func( preds, soft_target, reduction='none') * scale_factor.abs().pow(beta) loss = loss.sum(1) if num_total_pos is not None: loss = loss.sum() / num_total_pos else: loss = loss.mean() return loss def bbox_loss(self, s_bbox, t_bbox, weight_targets=None): # [x,y,w,h] if weight_targets is not None: loss = paddle.sum(self.loss_bbox(s_bbox, t_bbox) * weight_targets) avg_factor = weight_targets.sum() loss = loss / avg_factor else: loss = paddle.mean(self.loss_bbox(s_bbox, t_bbox)) return loss def distribution_focal_loss(self, pred_corners, target_corners, weight_targets=None): target_corners_label = F.softmax(target_corners, axis=-1) loss_dfl = F.cross_entropy( pred_corners, target_corners_label, soft_label=True, reduction='none') loss_dfl = loss_dfl.sum(1) if weight_targets is not None: loss_dfl = loss_dfl * (weight_targets.expand([-1, 4]).reshape([-1])) loss_dfl = loss_dfl.sum(-1) / weight_targets.sum() else: loss_dfl = loss_dfl.mean(-1) return loss_dfl / 4.0 # 4 direction def main_kd(self, mask_positive, pred_scores, soft_cls, num_classes): num_pos = mask_positive.sum() if num_pos > 0: cls_mask = mask_positive.unsqueeze(-1).tile([1, 1, num_classes]) pred_scores_pos = paddle.masked_select( pred_scores, cls_mask).reshape([-1, num_classes]) soft_cls_pos = paddle.masked_select( soft_cls, cls_mask).reshape([-1, num_classes]) loss_kd = self.loss_kd( pred_scores_pos, soft_cls_pos, avg_factor=num_pos) else: loss_kd = paddle.zeros([]) return loss_kd def forward(self, teacher_model, student_model): teacher_distill_pairs = teacher_model.yolo_head.distill_pairs student_distill_pairs = student_model.yolo_head.distill_pairs if self.logits_distill and self.loss_weight_logits > 0: distill_bbox_loss, distill_dfl_loss, distill_cls_loss = [], [], [] distill_cls_loss.append( self.quality_focal_loss( student_distill_pairs['pred_cls_scores'].reshape( (-1, student_distill_pairs['pred_cls_scores'].shape[-1] )), 
teacher_distill_pairs['pred_cls_scores'].detach().reshape( (-1, teacher_distill_pairs['pred_cls_scores'].shape[-1] )), num_total_pos=student_distill_pairs['pos_num'], use_sigmoid=False)) distill_bbox_loss.append( self.bbox_loss(student_distill_pairs['pred_bboxes_pos'], teacher_distill_pairs['pred_bboxes_pos'].detach(), weight_targets=student_distill_pairs['bbox_weight'] ) if 'pred_bboxes_pos' in student_distill_pairs and \ 'pred_bboxes_pos' in teacher_distill_pairs and \ 'bbox_weight' in student_distill_pairs else paddle.zeros([])) distill_dfl_loss.append( self.distribution_focal_loss( student_distill_pairs['pred_dist_pos'].reshape((-1, student_distill_pairs['pred_dist_pos'].shape[-1])), teacher_distill_pairs['pred_dist_pos'].detach().reshape((-1, teacher_distill_pairs['pred_dist_pos'].shape[-1])), \ weight_targets=student_distill_pairs['bbox_weight'] ) if 'pred_dist_pos' in student_distill_pairs and \ 'pred_dist_pos' in teacher_distill_pairs and \ 'bbox_weight' in student_distill_pairs else paddle.zeros([])) distill_cls_loss = paddle.add_n(distill_cls_loss) distill_bbox_loss = paddle.add_n(distill_bbox_loss) distill_dfl_loss = paddle.add_n(distill_dfl_loss) logits_loss = distill_bbox_loss * self.bbox_loss_weight + distill_cls_loss * self.qfl_loss_weight + distill_dfl_loss * self.dfl_loss_weight if self.logits_ld_distill: loss_kd = self.main_kd( student_distill_pairs['mask_positive_select'], student_distill_pairs['pred_cls_scores'], teacher_distill_pairs['pred_cls_scores'], student_model.yolo_head.num_classes, ) logits_loss += loss_kd else: logits_loss = paddle.zeros([]) if self.feat_distill and self.loss_weight_feat > 0: feat_loss_list = [] inputs = student_model.inputs assert 'gt_bbox' in inputs assert self.feat_distill_place in student_distill_pairs assert self.feat_distill_place in teacher_distill_pairs stu_feats = student_distill_pairs[self.feat_distill_place] tea_feats = teacher_distill_pairs[self.feat_distill_place] for i, loss_module in enumerate(self.distill_feat_loss_modules): feat_loss_list.append( loss_module(stu_feats[i], tea_feats[i], inputs)) feat_loss = paddle.add_n(feat_loss_list) else: feat_loss = paddle.zeros([]) student_model.yolo_head.distill_pairs.clear() teacher_model.yolo_head.distill_pairs.clear() return logits_loss * self.loss_weight_logits, feat_loss * self.loss_weight_feat @register class CWDFeatureLoss(nn.Layer): def __init__(self, student_channels, teacher_channels, normalize=False, tau=1.0, weight=1.0): super(CWDFeatureLoss, self).__init__() self.normalize = normalize self.tau = tau self.loss_weight = weight if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0) else: self.align = None def distill_softmax(self, x, tau): _, _, w, h = x.shape x = paddle.reshape(x, [-1, w * h]) x /= tau return F.softmax(x, axis=1) def forward(self, preds_s, preds_t, inputs=None): assert preds_s.shape[-2:] == preds_t.shape[-2:] N, C, H, W = preds_s.shape eps = 1e-5 if self.align is not None: preds_s = self.align(preds_s) if self.normalize: preds_s = feature_norm(preds_s) preds_t = feature_norm(preds_t) softmax_pred_s = self.distill_softmax(preds_s, self.tau) softmax_pred_t = self.distill_softmax(preds_t, self.tau) loss = paddle.sum(-softmax_pred_t * paddle.log(eps + softmax_pred_s) + softmax_pred_t * paddle.log(eps + softmax_pred_t)) return self.loss_weight * loss / (C * N) @register class FGDFeatureLoss(nn.Layer): """ Focal and Global Knowledge Distillation for Detectors The code is reference from 
https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py Args: student_channels (int): The number of channels in the student's FPN feature map. Default to 256. teacher_channels (int): The number of channels in the teacher's FPN feature map. Default to 256. normalize (bool): Whether to normalize the feature maps. temp (float, optional): The temperature coefficient. Defaults to 0.5. alpha_fgd (float, optional): The weight of fg_loss. Defaults to 0.001 beta_fgd (float, optional): The weight of bg_loss. Defaults to 0.0005 gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001 lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005 """ def __init__(self, student_channels, teacher_channels, normalize=False, loss_weight=1.0, temp=0.5, alpha_fgd=0.001, beta_fgd=0.0005, gamma_fgd=0.001, lambda_fgd=0.000005): super(FGDFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight self.temp = temp self.alpha_fgd = alpha_fgd self.beta_fgd = beta_fgd self.gamma_fgd = gamma_fgd self.lambda_fgd = lambda_fgd kaiming_init = parameter_init("kaiming") zeros_init = parameter_init("constant", 0.0) if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0, weight_attr=kaiming_init) student_channels = teacher_channels else: self.align = None self.conv_mask_s = nn.Conv2D( student_channels, 1, kernel_size=1, weight_attr=kaiming_init) self.conv_mask_t = nn.Conv2D( teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init) self.stu_conv_block = nn.Sequential( nn.Conv2D( student_channels, student_channels // 2, kernel_size=1, weight_attr=zeros_init), nn.LayerNorm([student_channels // 2, 1, 1]), nn.ReLU(), nn.Conv2D( student_channels // 2, student_channels, kernel_size=1, weight_attr=zeros_init)) self.tea_conv_block = nn.Sequential( nn.Conv2D( teacher_channels, teacher_channels // 2, kernel_size=1, weight_attr=zeros_init), nn.LayerNorm([teacher_channels // 2, 1, 1]), nn.ReLU(), nn.Conv2D( teacher_channels // 2, teacher_channels, kernel_size=1, weight_attr=zeros_init)) def spatial_channel_attention(self, x, t=0.5): shape = x.shape N, C, H, W = shape _f = paddle.abs(x) spatial_map = paddle.reshape( paddle.mean( _f, axis=1, keepdim=True) / t, [N, -1]) spatial_map = F.softmax(spatial_map, axis=1, dtype="float32") * H * W spatial_att = paddle.reshape(spatial_map, [N, H, W]) channel_map = paddle.mean( paddle.mean( _f, axis=2, keepdim=False), axis=2, keepdim=False) channel_att = F.softmax(channel_map / t, axis=1, dtype="float32") * C return [spatial_att, channel_att] def spatial_pool(self, x, mode="teacher"): batch, channel, width, height = x.shape x_copy = x x_copy = paddle.reshape(x_copy, [batch, channel, height * width]) x_copy = x_copy.unsqueeze(1) if mode.lower() == "student": context_mask = self.conv_mask_s(x) else: context_mask = self.conv_mask_t(x) context_mask = paddle.reshape(context_mask, [batch, 1, height * width]) context_mask = F.softmax(context_mask, axis=2) context_mask = context_mask.unsqueeze(-1) context = paddle.matmul(x_copy, context_mask) context = paddle.reshape(context, [batch, channel, 1, 1]) return context def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att, tea_spatial_att): def _func(a, b): return paddle.sum(paddle.abs(a - b)) / len(a) mask_loss = _func(stu_channel_att, tea_channel_att) + _func( stu_spatial_att, tea_spatial_att) return mask_loss def feature_loss(self, stu_feature, tea_feature, mask_fg, mask_bg, 
                     tea_channel_att, tea_spatial_att):
        mask_fg = mask_fg.unsqueeze(axis=1)
        mask_bg = mask_bg.unsqueeze(axis=1)
        tea_channel_att = tea_channel_att.unsqueeze(axis=-1).unsqueeze(axis=-1)
        tea_spatial_att = tea_spatial_att.unsqueeze(axis=1)

        fea_t = paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att))
        fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att))
        fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_fg))
        bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_bg))

        fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att))
        fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att))
        fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_fg))
        bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_bg))

        fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction="sum") / len(mask_fg)
        bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction="sum") / len(mask_bg)
        return fg_loss, bg_loss

    def relation_loss(self, stu_feature, tea_feature):
        context_s = self.spatial_pool(stu_feature, "student")
        context_t = self.spatial_pool(tea_feature, "teacher")
        out_s = stu_feature + self.stu_conv_block(context_s)
        out_t = tea_feature + self.tea_conv_block(context_t)
        rela_loss = F.mse_loss(out_s, out_t, reduction="sum") / len(out_s)
        return rela_loss

    def mask_value(self, mask, xl, xr, yl, yr, value):
        mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value)
        return mask

    def forward(self, stu_feature, tea_feature, inputs):
        # student and teacher feature maps must share the same spatial size
        assert stu_feature.shape[-2:] == tea_feature.shape[-2:]
        assert "gt_bbox" in inputs.keys() and "im_shape" in inputs.keys()
        gt_bboxes = inputs['gt_bbox']
        ins_shape = [
            inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0])
        ]

        index_gt = []
        for i in range(len(gt_bboxes)):
            if gt_bboxes[i].size > 2:
                index_gt.append(i)
        # only distill features for images that have labeled gt boxes
        if len(index_gt) != len(gt_bboxes):
            index_gt_t = paddle.to_tensor(index_gt)
            stu_feature = paddle.index_select(stu_feature, index_gt_t)
            tea_feature = paddle.index_select(tea_feature, index_gt_t)
            ins_shape = [ins_shape[c] for c in index_gt]
            gt_bboxes = [gt_bboxes[c] for c in index_gt]
            assert len(gt_bboxes) == tea_feature.shape[0]

        if self.align is not None:
            stu_feature = self.align(stu_feature)

        if self.normalize:
            stu_feature = feature_norm(stu_feature)
            tea_feature = feature_norm(tea_feature)

        tea_spatial_att, tea_channel_att = self.spatial_channel_attention(
            tea_feature, self.temp)
        stu_spatial_att, stu_channel_att = self.spatial_channel_attention(
            stu_feature, self.temp)

        mask_fg = paddle.zeros(tea_spatial_att.shape)
        mask_bg = paddle.ones_like(tea_spatial_att)
        one_tmp = paddle.ones([*tea_spatial_att.shape[1:]])
        zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]])
        mask_fg.stop_gradient = True
        mask_bg.stop_gradient = True
        one_tmp.stop_gradient = True
        zero_tmp.stop_gradient = True

        wmin, wmax, hmin, hmax = [], [], [], []

        if len(gt_bboxes) == 0:
            loss = self.relation_loss(stu_feature, tea_feature)
            return self.lambda_fgd * loss

        N, _, H, W = stu_feature.shape
        for i in range(N):
            tmp_box = paddle.ones_like(gt_bboxes[i])
            tmp_box.stop_gradient = True
            tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W
            tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W
            tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H
            tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H

            zero = paddle.zeros_like(tmp_box[:, 0], dtype="int32")
            ones = paddle.ones_like(tmp_box[:, 2], dtype="int32")
            zero.stop_gradient = True
            ones.stop_gradient = True

            wmin.append(
                paddle.cast(paddle.floor(tmp_box[:, 0]), "int32").maximum(zero))
            wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]),
"int32")) hmin.append( paddle.cast(paddle.floor(tmp_box[:, 1]), "int32").maximum(zero)) hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), "int32")) area_recip = 1.0 / ( hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / ( wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1])) for j in range(len(gt_bboxes[i])): if gt_bboxes[i][j].sum() > 0: mask_fg[i] = self.mask_value( mask_fg[i], hmin[i][j], hmax[i][j] + 1, wmin[i][j], wmax[i][j] + 1, area_recip[0][j]) mask_bg[i] = paddle.where(mask_fg[i] > zero_tmp, zero_tmp, one_tmp) if paddle.sum(mask_bg[i]): mask_bg[i] /= paddle.sum(mask_bg[i]) fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, mask_fg, mask_bg, tea_channel_att, tea_spatial_att) mask_loss = self.mask_loss(stu_channel_att, tea_channel_att, stu_spatial_att, tea_spatial_att) rela_loss = self.relation_loss(stu_feature, tea_feature) loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \ + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss return loss * self.loss_weight @register class PKDFeatureLoss(nn.Layer): """ PKD: General Distillation Framework for Object Detectors via Pearson Correlation Coefficient. Args: loss_weight (float): Weight of loss. Defaults to 1.0. resize_stu (bool): If True, we'll down/up sample the features of the student model to the spatial size of those of the teacher model if their spatial sizes are different. And vice versa. Defaults to True. """ def __init__(self, student_channels=256, teacher_channels=256, normalize=True, loss_weight=1.0, resize_stu=True): super(PKDFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight self.resize_stu = resize_stu def forward(self, stu_feature, tea_feature, inputs=None): size_s, size_t = stu_feature.shape[2:], tea_feature.shape[2:] if size_s[0] != size_t[0]: if self.resize_stu: stu_feature = F.interpolate( stu_feature, size_t, mode='bilinear') else: tea_feature = F.interpolate( tea_feature, size_s, mode='bilinear') assert stu_feature.shape == tea_feature.shape if self.normalize: stu_feature = feature_norm(stu_feature) tea_feature = feature_norm(tea_feature) loss = F.mse_loss(stu_feature, tea_feature) / 2 return loss * self.loss_weight @register class MimicFeatureLoss(nn.Layer): def __init__(self, student_channels=256, teacher_channels=256, normalize=True, loss_weight=1.0): super(MimicFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight self.mse_loss = nn.MSELoss() if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0) else: self.align = None def forward(self, stu_feature, tea_feature, inputs=None): if self.align is not None: stu_feature = self.align(stu_feature) if self.normalize: stu_feature = feature_norm(stu_feature) tea_feature = feature_norm(tea_feature) loss = self.mse_loss(stu_feature, tea_feature) return loss * self.loss_weight @register class MGDFeatureLoss(nn.Layer): def __init__(self, student_channels=256, teacher_channels=256, normalize=True, loss_weight=1.0, loss_func='mse'): super(MGDFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight assert loss_func in ['mse', 'ssim'] self.loss_func = loss_func self.mse_loss = nn.MSELoss(reduction='sum') self.ssim_loss = SSIM(11) kaiming_init = parameter_init("kaiming") if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0, weight_attr=kaiming_init, bias_attr=False) else: self.align = 
None self.generation = nn.Sequential( nn.Conv2D( teacher_channels, teacher_channels, kernel_size=3, padding=1), nn.ReLU(), nn.Conv2D( teacher_channels, teacher_channels, kernel_size=3, padding=1)) def forward(self, stu_feature, tea_feature, inputs=None): N = stu_feature.shape[0] if self.align is not None: stu_feature = self.align(stu_feature) stu_feature = self.generation(stu_feature) if self.normalize: stu_feature = feature_norm(stu_feature) tea_feature = feature_norm(tea_feature) if self.loss_func == 'mse': loss = self.mse_loss(stu_feature, tea_feature) / N elif self.loss_func == 'ssim': ssim_loss = self.ssim_loss(stu_feature, tea_feature) loss = paddle.clip((1 - ssim_loss) / 2, 0, 1) else: raise ValueError return loss * self.loss_weight class SSIM(nn.Layer): def __init__(self, window_size=11, size_average=True): super(SSIM, self).__init__() self.window_size = window_size self.size_average = size_average self.channel = 1 self.window = self.create_window(window_size, self.channel) def gaussian(self, window_size, sigma): gauss = paddle.to_tensor([ math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) for x in range(window_size) ]) return gauss / gauss.sum() def create_window(self, window_size, channel): _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) window = _2D_window.expand([channel, 1, window_size, window_size]) return window def _ssim(self, img1, img2, window, window_size, channel, size_average=True): mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) mu1_sq = mu1.pow(2) mu2_sq = mu2.pow(2) mu1_mu2 = mu1 * mu2 sigma1_sq = F.conv2d( img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq sigma2_sq = F.conv2d( img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq sigma12 = F.conv2d( img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 C1 = 0.01**2 C2 = 0.03**2 ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( 1e-12 + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) if size_average: return ssim_map.mean() else: return ssim_map.mean([1, 2, 3]) def forward(self, img1, img2): channel = img1.shape[1] if channel == self.channel and self.window.dtype == img1.dtype: window = self.window else: window = self.create_window(self.window_size, channel) self.window = window self.channel = channel return self._ssim(img1, img2, window, self.window_size, channel, self.size_average) ================================================ FILE: ppdet/slim/distill_model.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
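
# A minimal usage sketch of the DistillModel wrapper defined below, assuming
# a slim config whose YAML names the teacher architecture and a
# `distill_loss`; the slim config path here is hypothetical, the student
# config exists in this repo:
#
#   from ppdet.core.workspace import load_config
#   cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
#   model = DistillModel(cfg, slim_cfg='configs/slim/distill/my_distill.yml')
#   outputs = model(batch)        # training mode: student losses plus
#                                 # 'distill_loss' and 'teacher_loss' entries
#   outputs['loss'].backward()    # model.parameters() is student-only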
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from ppdet.core.workspace import register, create, load_config from ppdet.utils.checkpoint import load_pretrain_weight from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'DistillModel', 'FGDDistillModel', 'CWDDistillModel', 'LDDistillModel', 'PPYOLOEDistillModel', ] @register class DistillModel(nn.Layer): """ Build common distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. """ def __init__(self, cfg, slim_cfg): super(DistillModel, self).__init__() self.arch = cfg.architecture self.stu_cfg = cfg self.student_model = create(self.stu_cfg.architecture) if 'pretrain_weights' in self.stu_cfg and self.stu_cfg.pretrain_weights: stu_pretrain = self.stu_cfg.pretrain_weights else: stu_pretrain = None slim_cfg = load_config(slim_cfg) self.tea_cfg = slim_cfg self.teacher_model = create(self.tea_cfg.architecture) if 'pretrain_weights' in self.tea_cfg and self.tea_cfg.pretrain_weights: tea_pretrain = self.tea_cfg.pretrain_weights else: tea_pretrain = None self.distill_cfg = slim_cfg # load pretrain weights self.is_inherit = False if stu_pretrain: if self.is_inherit and tea_pretrain: load_pretrain_weight(self.student_model, tea_pretrain) logger.debug( "Inheriting! loading teacher weights to student model!") load_pretrain_weight(self.student_model, stu_pretrain) logger.info("Student model has loaded pretrain weights!") if tea_pretrain: load_pretrain_weight(self.teacher_model, tea_pretrain) logger.info("Teacher model has loaded pretrain weights!") self.teacher_model.eval() for param in self.teacher_model.parameters(): param.trainable = False self.distill_loss = self.build_loss(self.distill_cfg) def build_loss(self, distill_cfg): if 'distill_loss' in distill_cfg and distill_cfg.distill_loss: return create(distill_cfg.distill_loss) else: return None def parameters(self): return self.student_model.parameters() def forward(self, inputs): if self.training: student_loss = self.student_model(inputs) with paddle.no_grad(): teacher_loss = self.teacher_model(inputs) loss = self.distill_loss(self.teacher_model, self.student_model) student_loss['distill_loss'] = loss student_loss['teacher_loss'] = teacher_loss['loss'] student_loss['loss'] += student_loss['distill_loss'] return student_loss else: return self.student_model(inputs) @register class FGDDistillModel(DistillModel): """ Build FGD distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. 
""" def __init__(self, cfg, slim_cfg): super(FGDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['RetinaNet', 'PicoDet' ], 'Unsupported arch: {}'.format(self.arch) self.is_inherit = True def build_loss(self, distill_cfg): assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss loss_func = dict() name_list = distill_cfg.distill_loss_name for name in name_list: loss_func[name] = create(distill_cfg.distill_loss) return loss_func def forward(self, inputs): if self.training: s_body_feats = self.student_model.backbone(inputs) s_neck_feats = self.student_model.neck(s_body_feats) with paddle.no_grad(): t_body_feats = self.teacher_model.backbone(inputs) t_neck_feats = self.teacher_model.neck(t_body_feats) loss_dict = {} for idx, k in enumerate(self.distill_loss): loss_dict[k] = self.distill_loss[k](s_neck_feats[idx], t_neck_feats[idx], inputs) if self.arch == "RetinaNet": loss = self.student_model.head(s_neck_feats, inputs) elif self.arch == "PicoDet": head_outs = self.student_model.head( s_neck_feats, self.student_model.export_post_process) loss_gfl = self.student_model.head.get_loss(head_outs, inputs) total_loss = paddle.add_n(list(loss_gfl.values())) loss = {} loss.update(loss_gfl) loss.update({'loss': total_loss}) else: raise ValueError(f"Unsupported model {self.arch}") for k in loss_dict: loss['loss'] += loss_dict[k] loss[k] = loss_dict[k] return loss else: body_feats = self.student_model.backbone(inputs) neck_feats = self.student_model.neck(body_feats) head_outs = self.student_model.head(neck_feats) if self.arch == "RetinaNet": bbox, bbox_num = self.student_model.head.post_process( head_outs, inputs['im_shape'], inputs['scale_factor']) return {'bbox': bbox, 'bbox_num': bbox_num} elif self.arch == "PicoDet": head_outs = self.student_model.head( neck_feats, self.student_model.export_post_process) scale_factor = inputs['scale_factor'] bboxes, bbox_num = self.student_model.head.post_process( head_outs, scale_factor, export_nms=self.student_model.export_nms) return {'bbox': bboxes, 'bbox_num': bbox_num} else: raise ValueError(f"Unsupported model {self.arch}") @register class CWDDistillModel(DistillModel): """ Build CWD distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. 
""" def __init__(self, cfg, slim_cfg): super(CWDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['GFL', 'RetinaNet'], 'Unsupported arch: {}'.format( self.arch) def build_loss(self, distill_cfg): assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss loss_func = dict() name_list = distill_cfg.distill_loss_name for name in name_list: loss_func[name] = create(distill_cfg.distill_loss) return loss_func def get_loss_retinanet(self, stu_fea_list, tea_fea_list, inputs): loss = self.student_model.head(stu_fea_list, inputs) loss_dict = {} for idx, k in enumerate(self.distill_loss): loss_dict[k] = self.distill_loss[k](stu_fea_list[idx], tea_fea_list[idx]) loss['loss'] += loss_dict[k] loss[k] = loss_dict[k] return loss def get_loss_gfl(self, stu_fea_list, tea_fea_list, inputs): loss = {} head_outs = self.student_model.head(stu_fea_list) loss_gfl = self.student_model.head.get_loss(head_outs, inputs) loss.update(loss_gfl) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) feat_loss = {} loss_dict = {} s_cls_feat, t_cls_feat = [], [] for s_neck_f, t_neck_f in zip(stu_fea_list, tea_fea_list): conv_cls_feat, _ = self.student_model.head.conv_feat(s_neck_f) cls_score = self.student_model.head.gfl_head_cls(conv_cls_feat) t_conv_cls_feat, _ = self.teacher_model.head.conv_feat(t_neck_f) t_cls_score = self.teacher_model.head.gfl_head_cls(t_conv_cls_feat) s_cls_feat.append(cls_score) t_cls_feat.append(t_cls_score) for idx, k in enumerate(self.distill_loss): loss_dict[k] = self.distill_loss[k](s_cls_feat[idx], t_cls_feat[idx]) feat_loss[f"neck_f_{idx}"] = self.distill_loss[k](stu_fea_list[idx], tea_fea_list[idx]) for k in feat_loss: loss['loss'] += feat_loss[k] loss[k] = feat_loss[k] for k in loss_dict: loss['loss'] += loss_dict[k] loss[k] = loss_dict[k] return loss def forward(self, inputs): if self.training: s_body_feats = self.student_model.backbone(inputs) s_neck_feats = self.student_model.neck(s_body_feats) with paddle.no_grad(): t_body_feats = self.teacher_model.backbone(inputs) t_neck_feats = self.teacher_model.neck(t_body_feats) if self.arch == "RetinaNet": loss = self.get_loss_retinanet(s_neck_feats, t_neck_feats, inputs) elif self.arch == "GFL": loss = self.get_loss_gfl(s_neck_feats, t_neck_feats, inputs) else: raise ValueError(f"unsupported arch {self.arch}") return loss else: body_feats = self.student_model.backbone(inputs) neck_feats = self.student_model.neck(body_feats) head_outs = self.student_model.head(neck_feats) if self.arch == "RetinaNet": bbox, bbox_num = self.student_model.head.post_process( head_outs, inputs['im_shape'], inputs['scale_factor']) return {'bbox': bbox, 'bbox_num': bbox_num} elif self.arch == "GFL": bbox_pred, bbox_num = head_outs output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output else: raise ValueError(f"unsupported arch {self.arch}") @register class LDDistillModel(DistillModel): """ Build LD distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. 
""" def __init__(self, cfg, slim_cfg): super(LDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['GFL'], 'Unsupported arch: {}'.format(self.arch) def forward(self, inputs): if self.training: s_body_feats = self.student_model.backbone(inputs) s_neck_feats = self.student_model.neck(s_body_feats) s_head_outs = self.student_model.head(s_neck_feats) with paddle.no_grad(): t_body_feats = self.teacher_model.backbone(inputs) t_neck_feats = self.teacher_model.neck(t_body_feats) t_head_outs = self.teacher_model.head(t_neck_feats) soft_label_list = t_head_outs[0] soft_targets_list = t_head_outs[1] student_loss = self.student_model.head.get_loss( s_head_outs, inputs, soft_label_list, soft_targets_list) total_loss = paddle.add_n(list(student_loss.values())) student_loss['loss'] = total_loss return student_loss else: return self.student_model(inputs) @register class PPYOLOEDistillModel(DistillModel): """ Build PPYOLOE distill model, only used in PPYOLOE Args: cfg: The student config. slim_cfg: The teacher and distill config. """ def __init__(self, cfg, slim_cfg): super(PPYOLOEDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['PPYOLOE'], 'Unsupported arch: {}'.format( self.arch) def forward(self, inputs, alpha=0.125): if self.training: with paddle.no_grad(): teacher_loss = self.teacher_model(inputs) if hasattr(self.teacher_model.yolo_head, "assigned_labels"): self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores = \ self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores delattr(self.teacher_model.yolo_head, "assigned_labels") delattr(self.teacher_model.yolo_head, "assigned_bboxes") delattr(self.teacher_model.yolo_head, "assigned_scores") student_loss = self.student_model(inputs) logits_loss, feat_loss = self.distill_loss(self.teacher_model, self.student_model) det_total_loss = student_loss['loss'] total_loss = alpha * (det_total_loss + logits_loss + feat_loss) student_loss['loss'] = total_loss student_loss['det_loss'] = det_total_loss student_loss['logits_loss'] = logits_loss student_loss['feat_loss'] = feat_loss return student_loss else: return self.student_model(inputs) ================================================ FILE: ppdet/slim/ofa.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import load_config, merge_config, create from ppdet.utils.checkpoint import load_weight, load_pretrain_weight from ppdet.utils.logger import setup_logger from ppdet.core.workspace import register, serializable from paddle.utils import try_import logger = setup_logger(__name__) @register @serializable class OFA(object): def __init__(self, ofa_config): super(OFA, self).__init__() self.ofa_config = ofa_config def __call__(self, model, param_state_dict): paddleslim = try_import('paddleslim') from paddleslim.nas.ofa import OFA, RunConfig, utils from paddleslim.nas.ofa.convert_super import Convert, supernet task = self.ofa_config['task'] expand_ratio = self.ofa_config['expand_ratio'] skip_neck = self.ofa_config['skip_neck'] skip_head = self.ofa_config['skip_head'] run_config = self.ofa_config['RunConfig'] if 'skip_layers' in run_config: skip_layers = run_config['skip_layers'] else: skip_layers = [] # 
supernet config sp_config = supernet(expand_ratio=expand_ratio) # convert to supernet model = Convert(sp_config).convert(model) skip_names = [] if skip_neck: skip_names.append('neck.') if skip_head: skip_names.append('head.') for name, sublayer in model.named_sublayers(): for n in skip_names: if n in name: skip_layers.append(name) run_config['skip_layers'] = skip_layers run_config = RunConfig(**run_config) # build ofa model ofa_model = OFA(model, run_config=run_config) ofa_model.set_epoch(0) ofa_model.set_task(task) input_spec = [{ "image": paddle.ones( shape=[1, 3, 640, 640], dtype='float32'), "im_shape": paddle.full( [1, 2], 640, dtype='float32'), "scale_factor": paddle.ones( shape=[1, 2], dtype='float32') }] ofa_model._clear_search_space(input_spec=input_spec) ofa_model._build_ss = True check_ss = ofa_model._sample_config('expand_ratio', phase=None) # tokenize the search space ofa_model.tokenize() # check token map, search cands and search space logger.info('Token map is {}'.format(ofa_model.token_map)) logger.info('Search candidates is {}'.format(ofa_model.search_cands)) logger.info('The length of search_space is {}, search_space is {}'. format(len(ofa_model._ofa_layers), ofa_model._ofa_layers)) # set model state dict into ofa model utils.set_state_dict(ofa_model.model, param_state_dict) return ofa_model ================================================ FILE: ppdet/slim/prune.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from paddle.utils import try_import from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) def print_prune_params(model): model_dict = model.state_dict() for key in model_dict.keys(): weight_name = model_dict[key].name logger.info('Parameter name: {}, shape: {}'.format( weight_name, model_dict[key].shape)) @register @serializable class Pruner(object): def __init__(self, criterion, pruned_params, pruned_ratios, print_params=False): super(Pruner, self).__init__() assert criterion in ['l1_norm', 'fpgm'], \ "unsupported prune criterion: {}".format(criterion) self.criterion = criterion self.pruned_params = pruned_params self.pruned_ratios = pruned_ratios self.print_params = print_params def __call__(self, model): # FIXME: adapt to network graph when Training and inference are # inconsistent, now only supports prune inference network graph. 
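        # Note: paddleslim's dygraph_flops returns a raw FLOP count for the
        # given input_spec; the divisions by (1000**3) below report GFLOPs.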
        model.eval()

        paddleslim = try_import('paddleslim')
        from paddleslim.analysis import dygraph_flops as flops
        input_spec = [{
            "image": paddle.ones(
                shape=[1, 3, 640, 640], dtype='float32'),
            "im_shape": paddle.full(
                [1, 2], 640, dtype='float32'),
            "scale_factor": paddle.ones(
                shape=[1, 2], dtype='float32')
        }]
        if self.print_params:
            print_prune_params(model)

        ori_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops))
        if self.criterion == 'fpgm':
            pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec)
        elif self.criterion == 'l1_norm':
            pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec)

        logger.info("pruned params: {}".format(self.pruned_params))
        pruned_ratios = [float(n) for n in self.pruned_ratios]
        ratios = {}
        for i, param in enumerate(self.pruned_params):
            ratios[param] = pruned_ratios[i]
        pruner.prune_vars(ratios, [0])

        pruned_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format(
            pruned_flops, (ori_flops - pruned_flops) / ori_flops))

        return model


@register
@serializable
class PrunerQAT(object):
    def __init__(self, criterion, pruned_params, pruned_ratios,
                 print_prune_params, quant_config, print_qat_model):
        super(PrunerQAT, self).__init__()
        assert criterion in ['l1_norm', 'fpgm'], \
            "unsupported prune criterion: {}".format(criterion)
        # Pruner hyperparameter
        self.criterion = criterion
        self.pruned_params = pruned_params
        self.pruned_ratios = pruned_ratios
        self.print_prune_params = print_prune_params
        # QAT hyperparameter
        self.quant_config = quant_config
        self.print_qat_model = print_qat_model

    def __call__(self, model):
        # FIXME: adapt to network graph when training and inference are
        # inconsistent, now only supports pruning the inference network graph.
        model.eval()

        paddleslim = try_import('paddleslim')
        from paddleslim.analysis import dygraph_flops as flops
        input_spec = [{
            "image": paddle.ones(
                shape=[1, 3, 640, 640], dtype='float32'),
            "im_shape": paddle.full(
                [1, 2], 640, dtype='float32'),
            "scale_factor": paddle.ones(
                shape=[1, 2], dtype='float32')
        }]
        if self.print_prune_params:
            print_prune_params(model)

        # divide by 1000**3 so the logged value really is GFLOPs,
        # consistent with Pruner above
        ori_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops))
        if self.criterion == 'fpgm':
            pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec)
        elif self.criterion == 'l1_norm':
            pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec)

        logger.info("pruned params: {}".format(self.pruned_params))
        pruned_ratios = [float(n) for n in self.pruned_ratios]
        ratios = {}
        for i, param in enumerate(self.pruned_params):
            ratios[param] = pruned_ratios[i]
        pruner.prune_vars(ratios, [0])

        pruned_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format(
            pruned_flops, (ori_flops - pruned_flops) / ori_flops))

        self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config)
        self.quanter.quantize(model)

        if self.print_qat_model:
            logger.info("Quantized model:")
            logger.info(model)

        return model

    def save_quantized_model(self, layer, path, input_spec=None, **config):
        self.quanter.save_quantized_model(
            model=layer, path=path, input_spec=input_spec, **config)


================================================
FILE: ppdet/slim/quant.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from paddle.utils import try_import from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class QAT(object): def __init__(self, quant_config, print_model): super(QAT, self).__init__() self.quant_config = quant_config self.print_model = print_model def __call__(self, model): paddleslim = try_import('paddleslim') self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) if self.print_model: logger.info("Model before quant:") logger.info(model) # For PP-YOLOE, convert model to deploy firstly. for layer in model.sublayers(): if hasattr(layer, 'convert_to_deploy'): layer.convert_to_deploy() self.quanter.quantize(model) if self.print_model: logger.info("Quantized model:") logger.info(model) return model def save_quantized_model(self, layer, path, input_spec=None, **config): self.quanter.save_quantized_model( model=layer, path=path, input_spec=input_spec, **config) @register @serializable class PTQ(object): def __init__(self, ptq_config, quant_batch_num=10, output_dir='output_inference', fuse=True, fuse_list=None): super(PTQ, self).__init__() self.ptq_config = ptq_config self.quant_batch_num = quant_batch_num self.output_dir = output_dir self.fuse = fuse self.fuse_list = fuse_list def __call__(self, model): paddleslim = try_import('paddleslim') self.ptq = paddleslim.PTQ(**self.ptq_config) model.eval() quant_model = self.ptq.quantize( model, fuse=self.fuse, fuse_list=self.fuse_list) return quant_model def save_quantized_model(self, quant_model, quantize_model_path, input_spec=None): self.ptq.save_quantized_model(quant_model, quantize_model_path, input_spec) ================================================ FILE: ppdet/slim/unstructured_prune.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
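
# The UnstructuredPruner defined below maps epoch-based settings onto
# paddleslim's iteration-based GMP schedule. As a worked example with
# hypothetical values stable_epochs=1, pruning_epochs=5, tunning_epochs=4
# and steps_per_epoch=1000, the configs passed to GMPUnstructuredPruner
# become stable_iterations=1000, pruning_iterations=5000 and
# tunning_iterations=4000, sparsifying gradually from initial_ratio up to
# ratio over the pruning phase.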
from __future__ import absolute_import from __future__ import division from __future__ import print_function from paddle.utils import try_import from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class UnstructuredPruner(object): def __init__(self, stable_epochs, pruning_epochs, tunning_epochs, pruning_steps, ratio, initial_ratio, prune_params_type=None): self.stable_epochs = stable_epochs self.pruning_epochs = pruning_epochs self.tunning_epochs = tunning_epochs self.ratio = ratio self.prune_params_type = prune_params_type self.initial_ratio = initial_ratio self.pruning_steps = pruning_steps def __call__(self, model, steps_per_epoch, skip_params_func=None): paddleslim = try_import('paddleslim') from paddleslim import GMPUnstructuredPruner configs = { 'pruning_strategy': 'gmp', 'stable_iterations': self.stable_epochs * steps_per_epoch, 'pruning_iterations': self.pruning_epochs * steps_per_epoch, 'tunning_iterations': self.tunning_epochs * steps_per_epoch, 'resume_iteration': 0, 'pruning_steps': self.pruning_steps, 'initial_ratio': self.initial_ratio, } pruner = GMPUnstructuredPruner( model, ratio=self.ratio, skip_params_func=skip_params_func, prune_params_type=self.prune_params_type, local_sparsity=True, configs=configs) return pruner ================================================ FILE: ppdet/utils/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
================================================
FILE: ppdet/utils/cam_utils.py
================================================
import numpy as np
import cv2
import os
import sys
import glob
from ppdet.utils.logger import setup_logger
import copy
logger = setup_logger('ppdet_cam')

import paddle
from ppdet.engine import Trainer


def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert infer_img is not None or infer_dir is not None, \
        "--infer_img or --infer_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    logger.info("Found {} inference images in total.".format(len(images)))

    return images


def compute_ious(boxes1, boxes2):
    """Compute the pairwise IoU matrix for two sets of boxes.

    Args:
        boxes1 (numpy ndarray with shape (N, 4)): bounding boxes in
            (xmin, ymin, xmax, ymax) format.
        boxes2 (numpy ndarray with shape (M, 4)): bounding boxes in
            (xmin, ymin, xmax, ymax) format.

    Returns:
        Pairwise IoU matrix with shape (N, M), where the value at row i,
        column j holds the IoU between the i-th box of boxes1 and the
        j-th box of boxes2.
    """
    # lu with shape (N,M,2); boxes1[:,None,:2] has shape (N,1,2),
    # boxes2[:,:2] has shape (M,2)
    lu = np.maximum(boxes1[:, None, :2], boxes2[:, :2])
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])  # rd same shape as lu
    intersection_wh = np.maximum(0.0, rd - lu)
    intersection_area = intersection_wh[:, :, 0] * intersection_wh[:, :, 1]  # (N,M)
    boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2])
    boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1]  # (N,)
    boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2])
    boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1]  # (M,)
    union_area = np.maximum(
        boxes1_area[:, None] + boxes2_area - intersection_area, 1e-8)  # (N,M)
    ious = np.clip(intersection_area / union_area, 0.0, 1.0)
    return ious


def grad_cam(feat, grad):
    """
    Args:
        feat: CxHxW
        grad: CxHxW

    Returns:
        cam: HxW
    """
    exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0)
    exp = np.maximum(-exp, 0)
    return exp


def resize_cam(explanation, resize_shape) -> np.ndarray:
    """
    Args:
        explanation: 2D explanation heatmap with shape (height, width).
        resize_shape: target size as (width, height).

    Returns:
        The heatmap normalized, resized and colorized as a uint8 RGB image.
    """
    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
        f"Currently support 2D explanation results for visualization. " \
        "Reduce higher dimensions to 2D for visualization."
explanation = (explanation - explanation.min()) / ( explanation.max() - explanation.min()) explanation = cv2.resize(explanation, resize_shape) explanation = np.uint8(255 * explanation) explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET) explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB) return explanation class BBoxCAM: def __init__(self, FLAGS, cfg): self.FLAGS = FLAGS self.cfg = cfg # build model self.trainer = self.build_trainer(cfg) # num_class self.num_class = cfg.num_classes # set hook for extraction of featuremaps and grads self.set_hook(cfg) self.nms_idx_need_divid_numclass_arch = [ 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN' ] """ In these networks, the bbox array shape before nms contain num_class, the nms_keep_idx of the bbox need to divide the num_class; """ # cam image output_dir try: os.makedirs(FLAGS.cam_out) except: print('Path already exists.') pass def build_trainer(self, cfg): # build trainer trainer = Trainer(cfg, mode='test') # load weights trainer.load_weights(cfg.weights) # set for get extra_data before nms trainer.model.use_extra_data = True # set for record the bbox index before nms if cfg.architecture in ['FasterRCNN', 'MaskRCNN']: trainer.model.bbox_post_process.nms.return_index = True elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']: if trainer.model.post_process is not None: # anchor based YOLOs: YOLOv3,PP-YOLO trainer.model.post_process.nms.return_index = True else: # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ trainer.model.yolo_head.nms.return_index = True elif cfg.architecture == 'BlazeFace' or cfg.architecture == 'SSD': trainer.model.post_process.nms.return_index = True elif cfg.architecture == 'RetinaNet': trainer.model.head.nms.return_index = True else: print(cfg.architecture + ' is not supported for cam temporarily!') sys.exit() # Todo: Unify the head/post_process name in each model return trainer def set_hook(self, cfg): # set hook for extraction of featuremaps and grads self.target_feats = {} self.target_layer_name = cfg.target_feature_layer_name # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor def hook(layer, input, output): self.target_feats[layer._layer_name_for_hook] = output try: exec('self.trainer.' + self.target_layer_name + '._layer_name_for_hook = self.target_layer_name') # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name exec('self.trainer.' + self.target_layer_name + '.register_forward_post_hook(hook)') # self.trainer.target_layer_name.register_forward_post_hook(hook) except: print("Error! " "The target_layer_name--" + self.target_layer_name + " is not in model! 
" "Please check the spelling and " "the network's architecture!") sys.exit() def get_bboxes(self): # get inference images images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img) # inference result = self.trainer.predict( images, draw_threshold=self.FLAGS.draw_threshold, output_dir=self.FLAGS.output_dir, save_results=self.FLAGS.save_results, visualize=False)[0] return result def get_bboxes_cams(self): # Get the bboxes prediction(after nms result) of the input inference_result = self.get_bboxes() # read input image # Todo: Support folder multi-images process from PIL import Image img = np.array(Image.open(self.cfg.infer_img)) # data for calaulate bbox grad_cam extra_data = inference_result['extra_data'] """ Example of Faster_RCNN based architecture: extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] 'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1] } Example of YOLOv3 based architecture: extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400] 'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1] } """ # array index of the predicted bbox before nms if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch: # some network's bbox array shape before nms may be like [num_of_bboxes_before_nms, num_classes, 4], # we need to divide num_classes to get the before_nms_index; # currently, only include the rcnn architectures (fasterrcnn, maskrcnn, cascadercnn); before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy( ) // self.num_class # num_class else: before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy() # Calculate and visualize the heatmap of per predict bbox for index, target_bbox in enumerate(inference_result['bbox']): # target_bbox: [cls, score, x1, y1, x2, y2] # filter bboxes with low predicted scores if target_bbox[1] < self.FLAGS.draw_threshold: continue target_bbox_before_nms = int(before_nms_indexes[index]) if len(extra_data['scores'].shape) == 2: score_out = extra_data['scores'][target_bbox_before_nms] else: score_out = extra_data['scores'][0, :, target_bbox_before_nms] """ There are two kinds array shape of bbox score output : 1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] 2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000] """ # construct one_hot label and do backward to get the gradients predicted_label = paddle.argmax(score_out) label_onehot = paddle.nn.functional.one_hot( predicted_label, num_classes=len(score_out)) label_onehot = label_onehot.squeeze() target = paddle.sum(score_out * label_onehot) target.backward(retain_graph=True) if 'backbone' in self.target_layer_name or \ 'neck' in self.target_layer_name: # backbone/neck level feature if isinstance(self.target_feats[self.target_layer_name], list): # when the featuremap contains of multiple scales, # take the featuremap of the last scale # Todo: fuse the cam result from multisclae featuremaps if self.target_feats[self.target_layer_name][-1].shape[ -1] == 1: """ if the last level featuremap is 1x1 size, we take the second last one """ cam_grad = self.target_feats[self.target_layer_name][ -2].grad.squeeze().cpu().numpy() cam_feat = self.target_feats[self.target_layer_name][ -2].squeeze().cpu().numpy() else: cam_grad = self.target_feats[self.target_layer_name][ -1].grad.squeeze().cpu().numpy() cam_feat = self.target_feats[self.target_layer_name][ 
-1].squeeze().cpu().numpy() else: cam_grad = self.target_feats[ self.target_layer_name].grad.squeeze().cpu().numpy() cam_feat = self.target_feats[ self.target_layer_name].squeeze().cpu().numpy() else: # roi level feature cam_grad = self.target_feats[ self.target_layer_name].grad.squeeze().cpu().numpy()[ target_bbox_before_nms] cam_feat = self.target_feats[self.target_layer_name].squeeze( ).cpu().numpy()[target_bbox_before_nms] # grad_cam: exp = grad_cam(cam_feat, cam_grad) if 'backbone' in self.target_layer_name or \ 'neck' in self.target_layer_name: """ when use backbone/neck featuremap, we first do the cam on whole image, and then set the area outside the predic bbox to 0 """ # reshape the cam image to the input image size resized_exp = resize_cam(exp, (img.shape[1], img.shape[0])) mask = np.zeros((img.shape[0], img.shape[1], 3)) mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[ 2]):int(target_bbox[4]), :] = 1 resized_exp = resized_exp * mask # add the bbox cam back to the input image overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6) elif 'roi' in self.target_layer_name: # get the bbox part of the image bbox_img = copy.deepcopy(img[int(target_bbox[3]):int( target_bbox[5]), int(target_bbox[2]):int(target_bbox[ 4]), :]) # reshape the cam image to the bbox size resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0])) # add the bbox cam back to the bbox image bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6) # put the bbox_cam image to the original image overlay_vis = copy.deepcopy(img) overlay_vis[int(target_bbox[3]):int(target_bbox[5]), int( target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis else: print( 'Only supported cam for backbone/neck feature and roi feature, the others are not supported temporarily!' ) sys.exit() # put the bbox rectangle on image cv2.rectangle( overlay_vis, (int(target_bbox[2]), int(target_bbox[3])), (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2) # save visualization result cam_image = Image.fromarray(overlay_vis) cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg') # clear gradients after each bbox grad_cam target.clear_gradient() for n, v in self.trainer.model.named_sublayers(): v.clear_gradients() ================================================ FILE: ppdet/utils/check.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import sys import paddle import six import paddle.version as paddle_version from .logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version', 'check_config' ] def check_mlu(use_mlu): """ Log error and exit when set use_mlu=true in paddlepaddle cpu/gpu/xpu/npu version. """ err = "Config use_mlu cannot be set as true while you are " \ "using paddlepaddle cpu/gpu/xpu/npu version ! 
\nPlease try: \n" \ "\t1. Install paddlepaddle-mlu to run model on MLU \n" \ "\t2. Set use_mlu as false in config file to run " \ "model on CPU/GPU/XPU/NPU" try: if use_mlu and not paddle.is_compiled_with_mlu(): logger.error(err) sys.exit(1) except Exception as e: pass def check_npu(use_npu): """ Log error and exit when set use_npu=true in paddlepaddle version without paddle-custom-npu installed. """ err = "Config use_npu cannot be set as true while you are " \ "using paddlepaddle version without paddle-custom-npu " \ "installed! \nPlease try: \n" \ "\t1. Install paddle-custom-npu to run model on NPU \n" \ "\t2. Set use_npu as false in config file to run " \ "model on other devices supported." try: if use_npu and not 'npu' in paddle.device.get_all_custom_device_type(): logger.error(err) sys.exit(1) except Exception as e: pass def check_xpu(use_xpu): """ Log error and exit when set use_xpu=true in paddlepaddle cpu/gpu/npu version. """ err = "Config use_xpu cannot be set as true while you are " \ "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ "\t2. Set use_xpu as false in config file to run " \ "model on CPU/GPU/NPU" try: if use_xpu and not paddle.is_compiled_with_xpu(): logger.error(err) sys.exit(1) except Exception as e: pass def check_gpu(use_gpu): """ Log error and exit when set use_gpu=true in paddlepaddle cpu version. """ err = "Config use_gpu cannot be set as true while you are " \ "using paddlepaddle cpu version ! \nPlease try: \n" \ "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ "\t2. Set use_gpu as false in config file to run " \ "model on CPU" try: if use_gpu and not paddle.is_compiled_with_cuda(): logger.error(err) sys.exit(1) except Exception as e: pass def check_version(version='2.2'): """ Log error and exit when the installed version of paddlepaddle is not satisfied. """ err = "PaddlePaddle version {} or higher is required, " \ "or a suitable develop version is satisfied as well. \n" \ "Please make sure the version is good with your code.".format(version) version_installed = [ paddle_version.major, paddle_version.minor, paddle_version.patch, paddle_version.rc ] if version_installed == ['0', '0', '0', '0']: return version_split = version.split('.') length = min(len(version_installed), len(version_split)) for i in six.moves.range(length): if version_installed[i] > version_split[i]: return if version_installed[i] < version_split[i]: raise Exception(err) def check_config(cfg): """ Check the correctness of the configuration file. Log error and exit when Config is not compliant. """ err = "'{}' not specified in config file. Please set it in config file." check_list = ['architecture', 'num_classes'] try: for var in check_list: if not var in cfg: logger.error(err.format(var)) sys.exit(1) except Exception as e: pass if 'log_iter' not in cfg: cfg.log_iter = 20 return cfg ================================================ FILE: ppdet/utils/checkpoint.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import os import json import numpy as np import paddle import paddle.nn as nn from .download import get_weights_path from .logger import setup_logger logger = setup_logger(__name__) def convert_to_dict(obj): if isinstance(obj, dict): return {k: convert_to_dict(v) for k, v in obj.items()} elif isinstance(obj, list): return [convert_to_dict(i) for i in obj] else: return obj def is_url(path): """ Whether path is URL. Args: path (string): URL string or not. """ return path.startswith('http://') \ or path.startswith('https://') \ or path.startswith('ppdet://') def _strip_postfix(path): path, ext = os.path.splitext(path) assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ "Unknown postfix {} from weights".format(ext) return path def load_weight(model, weight, optimizer=None, ema=None, exchange=True): if is_url(weight): weight = get_weights_path(weight) path = _strip_postfix(weight) pdparam_path = path + '.pdparams' if not os.path.exists(pdparam_path): raise ValueError("Model pretrain path {} does not " "exists.".format(pdparam_path)) if ema is not None and os.path.exists(path + '.pdema'): if exchange: # Exchange model and ema_model to load logger.info('Exchange model and ema_model to load:') ema_state_dict = paddle.load(pdparam_path) logger.info('Loading ema_model weights from {}'.format(path + '.pdparams')) param_state_dict = paddle.load(path + '.pdema') logger.info('Loading model weights from {}'.format(path + '.pdema')) else: ema_state_dict = paddle.load(path + '.pdema') logger.info('Loading ema_model weights from {}'.format(path + '.pdema')) param_state_dict = paddle.load(pdparam_path) logger.info('Loading model weights from {}'.format(path + '.pdparams')) else: ema_state_dict = None param_state_dict = paddle.load(pdparam_path) if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): print('Loading pretrain weights for Teacher-Student framework.') print('Loading pretrain weights for Student model.') student_model_dict = model.modelStudent.state_dict() student_param_state_dict = match_state_dict( student_model_dict, param_state_dict, mode='student') model.modelStudent.set_dict(student_param_state_dict) print('Loading pretrain weights for Teacher model.') teacher_model_dict = model.modelTeacher.state_dict() teacher_param_state_dict = match_state_dict( teacher_model_dict, param_state_dict, mode='teacher') model.modelTeacher.set_dict(teacher_param_state_dict) else: model_dict = model.state_dict() model_weight = {} incorrect_keys = 0 for key in model_dict.keys(): if key in param_state_dict.keys(): model_weight[key] = param_state_dict[key] else: logger.info('Unmatched key: {}'.format(key)) incorrect_keys += 1 assert incorrect_keys == 0, "Load weight {} incorrectly, \ {} keys unmatched, please check again.".format(weight, incorrect_keys) logger.info('Finish resuming model weights: {}'.format(pdparam_path)) model.set_dict(model_weight) last_epoch = 0 if optimizer is not None and os.path.exists(path + '.pdopt'): optim_state_dict = paddle.load(path + '.pdopt') # to solve resume bug, will it be fixed in paddle 2.0 for key in optimizer.state_dict().keys(): if not key in optim_state_dict.keys(): optim_state_dict[key] = optimizer.state_dict()[key] if 'last_epoch' in optim_state_dict: last_epoch = optim_state_dict.pop('last_epoch') 
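# The loop above backfills every optimizer key that is missing from the
# loaded .pdopt dict with its freshly initialized value, so the
# set_state_dict call below does not fail on a partial optimizer state
# (the "resume bug" mentioned above).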
optimizer.set_state_dict(optim_state_dict) if ema_state_dict is not None: ema.resume(ema_state_dict, optim_state_dict['LR_Scheduler']['last_epoch']) elif ema_state_dict is not None: ema.resume(ema_state_dict) return last_epoch def match_state_dict(model_state_dict, weight_state_dict, mode='default'): """ Match between the model state dict and pretrained weight state dict. Return the matched state dict. The method assumes that every name in the pretrained weight state dict is a sub-name of some name in the model state dict, once the prefix 'backbone.' is stripped from the pretrained weight keys. This gives the candidates for each model key, and we select the candidate with the longest matched length as the final match. For example, the model state dict has the name of 'backbone.res2.res2a.branch2a.conv.weight' and the pretrained weight as name of 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight'. We match the 'res2.res2a.branch2a.conv.weight' to the model key. """ model_keys = sorted(model_state_dict.keys()) weight_keys = sorted(weight_state_dict.keys()) def teacher_match(a, b): # skip student params if b.startswith('modelStudent'): return False return a == b or a.endswith("." + b) or b.endswith("." + a) def student_match(a, b): # skip teacher params if b.startswith('modelTeacher'): return False return a == b or a.endswith("." + b) or b.endswith("." + a) def match(a, b): if b.startswith('backbone.res5'): b = b[9:] return a == b or a.endswith("." + b) if mode == 'student': match_op = student_match elif mode == 'teacher': match_op = teacher_match else: match_op = match match_matrix = np.zeros([len(model_keys), len(weight_keys)]) for i, m_k in enumerate(model_keys): for j, w_k in enumerate(weight_keys): if match_op(m_k, w_k): match_matrix[i, j] = len(w_k) max_id = match_matrix.argmax(1) max_len = match_matrix.max(1) max_id[max_len == 0] = -1 load_id = set(max_id) load_id.discard(-1) not_load_weight_name = [] if weight_keys[0].startswith('modelStudent') or weight_keys[0].startswith( 'modelTeacher'): for match_idx in range(len(max_id)): if max_id[match_idx] == -1: not_load_weight_name.append(model_keys[match_idx]) if len(not_load_weight_name) > 0: logger.info('{} in model is not matched with pretrained weights, ' 'and it will be trained from scratch'.format( not_load_weight_name)) else: for idx in range(len(weight_keys)): if idx not in load_id: not_load_weight_name.append(weight_keys[idx]) if len(not_load_weight_name) > 0: logger.info('{} in pretrained weight is not used in the model, ' 'and it will not be loaded'.format( not_load_weight_name)) matched_keys = {} result_state_dict = {} for model_id, weight_id in enumerate(max_id): if weight_id == -1: continue model_key = model_keys[model_id] weight_key = weight_keys[weight_id] weight_value = weight_state_dict[weight_key] model_value_shape = list(model_state_dict[model_key].shape) if list(weight_value.shape) != model_value_shape: logger.info( 'The shape {} in pretrained weight {} does not match ' 'the shape {} in model {}. 
And the weight {} will not be ' 'loaded'.format(weight_value.shape, weight_key, model_value_shape, model_key, weight_key)) continue assert model_key not in result_state_dict result_state_dict[model_key] = weight_value if weight_key in matched_keys: raise ValueError('Ambiguity weight {} loaded, it matches at least ' '{} and {} in the model'.format( weight_key, model_key, matched_keys[ weight_key])) matched_keys[weight_key] = model_key return result_state_dict def load_pretrain_weight(model, pretrain_weight, ARSL_eval=False): if is_url(pretrain_weight): pretrain_weight = get_weights_path(pretrain_weight) path = _strip_postfix(pretrain_weight) if not (os.path.isdir(path) or os.path.isfile(path) or os.path.exists(path + '.pdparams')): raise ValueError("Model pretrain path `{}` does not exists. " "If you don't want to load pretrain model, " "please delete `pretrain_weights` field in " "config file.".format(path)) teacher_student_flag = False if not ARSL_eval: if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): print('Loading pretrain weights for Teacher-Student framework.') print( 'Assert Teacher model has the same structure with Student model.' ) model_dict = model.modelStudent.state_dict() teacher_student_flag = True else: model_dict = model.state_dict() weights_path = path + '.pdparams' param_state_dict = paddle.load(weights_path) param_state_dict = match_state_dict(model_dict, param_state_dict) for k, v in param_state_dict.items(): if isinstance(v, np.ndarray): v = paddle.to_tensor(v) if model_dict[k].dtype != v.dtype: param_state_dict[k] = v.astype(model_dict[k].dtype) if teacher_student_flag: model.modelStudent.set_dict(param_state_dict) model.modelTeacher.set_dict(param_state_dict) else: model.set_dict(param_state_dict) logger.info('Finish loading model weights: {}'.format(weights_path)) else: weights_path = path + '.pdparams' param_state_dict = paddle.load(weights_path) student_model_dict = model.modelStudent.state_dict() student_param_state_dict = match_state_dict( student_model_dict, param_state_dict, mode='student') model.modelStudent.set_dict(student_param_state_dict) print('Loading pretrain weights for Teacher model.') teacher_model_dict = model.modelTeacher.state_dict() teacher_param_state_dict = match_state_dict( teacher_model_dict, param_state_dict, mode='teacher') model.modelTeacher.set_dict(teacher_param_state_dict) logger.info('Finish loading model weights: {}'.format(weights_path)) def save_model(model, optimizer, save_dir, save_name, last_epoch, ema_model=None): """ save model into disk. Args: model (dict): the model state_dict to save parameters. optimizer (paddle.optimizer.Optimizer): the Optimizer instance to save optimizer states. save_dir (str): the directory to be saved. save_name (str): the path to be saved. last_epoch (int): the epoch index. ema_model (dict|None): the ema_model state_dict to save parameters. 
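Example (a minimal sketch; the save_dir and epoch values are hypothetical):
    save_model(model, optimizer, save_dir='output/rtdetrv3_r50vd_6x_coco',
               save_name='model_final', last_epoch=71)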
""" if paddle.distributed.get_rank() != 0: return save_dir = os.path.normpath(save_dir) if not os.path.exists(save_dir): os.makedirs(save_dir) if save_name == "best_model": best_model_path = os.path.join(save_dir, 'best_model') if not os.path.exists(best_model_path): os.makedirs(best_model_path) save_path = os.path.join(save_dir, save_name) # save model if isinstance(model, nn.Layer): paddle.save(model.state_dict(), save_path + ".pdparams") best_model = model.state_dict() else: assert isinstance(model, dict), 'model is not a instance of nn.layer or dict' if ema_model is None: paddle.save(model, save_path + ".pdparams") best_model = model else: assert isinstance(ema_model, dict), ("ema_model is not a instance of dict, " "please call model.state_dict() to get.") # Exchange model and ema_model to save paddle.save(ema_model, save_path + ".pdparams") paddle.save(model, save_path + ".pdema") best_model = ema_model if save_name == 'best_model': best_model_path = os.path.join(best_model_path, 'model') paddle.save(best_model, best_model_path + ".pdparams") # save optimizer state_dict = optimizer.state_dict() state_dict['last_epoch'] = last_epoch paddle.save(state_dict, save_path + ".pdopt") logger.info("Save checkpoint: {}".format(save_dir)) def save_semi_model(teacher_model, student_model, optimizer, save_dir, save_name, last_epoch, last_iter): """ save teacher and student model into disk. Args: teacher_model (dict): the teacher_model state_dict to save parameters. student_model (dict): the student_model state_dict to save parameters. optimizer (paddle.optimizer.Optimizer): the Optimizer instance to save optimizer states. save_dir (str): the directory to be saved. save_name (str): the path to be saved. last_epoch (int): the epoch index. last_iter (int): the iter index. 
""" if paddle.distributed.get_rank() != 0: return assert isinstance(teacher_model, dict), ( "teacher_model is not a instance of dict, " "please call teacher_model.state_dict() to get.") assert isinstance(student_model, dict), ( "student_model is not a instance of dict, " "please call student_model.state_dict() to get.") if not os.path.exists(save_dir): os.makedirs(save_dir) save_path = os.path.join(save_dir, save_name) # save model paddle.save(teacher_model, save_path + str(last_epoch) + "epoch_t.pdparams") paddle.save(student_model, save_path + str(last_epoch) + "epoch_s.pdparams") # save optimizer state_dict = optimizer.state_dict() state_dict['last_epoch'] = last_epoch state_dict['last_iter'] = last_iter paddle.save(state_dict, save_path + str(last_epoch) + "epoch.pdopt") logger.info("Save checkpoint: {}".format(save_dir)) def save_model_info(model_info, save_path, prefix): """ save model info to the target path """ save_path = os.path.join(save_path, prefix) if not os.path.exists(save_path): os.makedirs(save_path) with open(os.path.join(save_path, f'{prefix}.info.json'), 'w') as f: json.dump(model_info, f) logger.info("Already save model info in {}".format(save_path)) def update_train_results(config, prefix, metric_info, done_flag=False, last_num=5, ema=False): if paddle.distributed.get_rank() != 0: return assert last_num >= 1 train_results_path = os.path.join(config["save_dir"], "train_result.json") save_model_tag = ["pdparams", "pdopt", "pdstates"] save_inference_tag = [ "inference_config", "pdmodel", "pdiparams", "pdiparams.info" ] if ema: save_model_tag.append("pdema") if os.path.exists(train_results_path): with open(train_results_path, "r") as fp: train_results = json.load(fp) else: train_results = {} train_results["model_name"] = config["pdx_model_name"] train_results["label_dict"] = "" train_results["visualdl_log"] = "" train_results["train_log"] = "train.log" train_results["config"] = "config.yaml" train_results["models"] = {} for i in range(1, last_num + 1): train_results["models"][f"last_{i}"] = {} train_results["models"]["best"] = {} train_results["done_flag"] = done_flag if prefix == "best_model": train_results["models"]["best"]["score"] = metric_info["metric"] for tag in save_model_tag: train_results["models"]["best"][tag] = os.path.join( prefix, f"{prefix}.{tag}") for tag in save_inference_tag: train_results["models"]["best"][tag] = os.path.join( prefix, "inference", f"inference.{tag}" if tag != "inference_config" else "inference.yml") else: for i in range(last_num - 1, 0, -1): train_results["models"][f"last_{i + 1}"] = train_results["models"][ f"last_{i}"].copy() train_results["models"][f"last_{1}"]["score"] = metric_info["metric"] for tag in save_model_tag: train_results["models"][f"last_{1}"][tag] = os.path.join( prefix, f"{prefix}.{tag}") for tag in save_inference_tag: train_results["models"][f"last_{1}"][tag] = os.path.join( prefix, "inference", f"inference.{tag}" if tag != "inference_config" else "inference.yml") with open(train_results_path, "w") as fp: json.dump(train_results, fp) ================================================ FILE: ppdet/utils/cli.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from argparse import ArgumentParser, RawDescriptionHelpFormatter import yaml import re from ppdet.core.workspace import get_registered_modules, dump_value __all__ = ['ColorTTY', 'ArgsParser'] class ColorTTY(object): def __init__(self): super(ColorTTY, self).__init__() self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan'] def __getattr__(self, attr): if attr in self.colors: color = self.colors.index(attr) + 31 def color_message(message): return "\033[{}m{}\033[0m".format(color, message) setattr(self, attr, color_message) return color_message def bold(self, message): return self.with_code('01', message) def with_code(self, code, message): return "\033[{}m{}\033[0m".format(code, message) class ArgsParser(ArgumentParser): def __init__(self): super(ArgsParser, self).__init__( formatter_class=RawDescriptionHelpFormatter) self.add_argument("-c", "--config", help="configuration file to use") self.add_argument( "-o", "--opt", nargs='*', help="set configuration options") def parse_args(self, argv=None): args = super(ArgsParser, self).parse_args(argv) assert args.config is not None, \ "Please specify --config=configure_file_path." args.opt = self._parse_opt(args.opt) return args def _parse_opt(self, opts): config = {} if not opts: return config for s in opts: s = s.strip() k, v = s.split('=', 1) if '.' not in k: config[k] = yaml.load(v, Loader=yaml.Loader) else: keys = k.split('.') if keys[0] not in config: config[keys[0]] = {} cur = config[keys[0]] for idx, key in enumerate(keys[1:]): if idx == len(keys) - 2: cur[key] = yaml.load(v, Loader=yaml.Loader) else: cur[key] = {} cur = cur[key] return config def merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']): for k, v in vars(args).items(): if k not in exclude_args: config[k] = v return config def print_total_cfg(config): modules = get_registered_modules() color_tty = ColorTTY() green = '___{}___'.format(color_tty.colors.index('green') + 31) styled = {} for key in config.keys(): if not config[key]: # empty schema continue if key not in modules and not hasattr(config[key], '__dict__'): styled[key] = config[key] continue elif key in modules: module = modules[key] else: type_name = type(config[key]).__name__ if type_name in modules: module = modules[type_name].copy() module.update({ k: v for k, v in config[key].__dict__.items() if k in module.schema }) key += " ({})".format(type_name) default = module.find_default_keys() missing = module.find_missing_keys() mismatch = module.find_mismatch_keys() extra = module.find_extra_keys() dep_missing = [] for dep in module.inject: if isinstance(module[dep], str) and module[dep] != '': if module[dep] not in modules: # not a valid module dep_missing.append(dep) else: dep_mod = modules[module[dep]] # empty dict but mandatory if not dep_mod and dep_mod.mandatory(): dep_missing.append(dep) override = list( set(module.keys()) - set(default) - set(extra) - set(dep_missing)) replacement = {} for name in set(override + default + extra + mismatch + missing): new_name = name if name in missing: value = "<missing>" else: value = module[name] if name in extra: value = dump_value(value) + " <extra>" elif name in mismatch: value = dump_value(value) + " <mismatch>" elif name in dep_missing: value = dump_value(value) + " <dep missing>" elif name in override and value != '': mark = green new_name = mark + name replacement[new_name] = value styled[key] = replacement buffer = yaml.dump(styled, default_flow_style=False, default_style='') buffer = (re.sub(r"<missing>", "\033[31m<missing>\033[0m", buffer)) buffer = (re.sub(r"<extra>", "\033[33m<extra>\033[0m", buffer)) buffer = (re.sub(r"<dep missing>", "\033[31m<dep missing>\033[0m", buffer)) buffer = (re.sub(r"<mismatch>", "\033[31m<mismatch>\033[0m", buffer)) buffer = re.sub(r"___(\d+)___(.*?):", "\033[\\1m\\2\033[0m:", buffer) print(buffer) ================================================ FILE: ppdet/utils/colormap.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import numpy as np def colormap(rgb=False): """ Get colormap. The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py """ color_list = np.array([ 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 ]).astype(np.float32) color_list = color_list.reshape((-1, 3)) * 255 if not rgb: color_list = color_list[:, ::-1] return color_list.astype('int32') ================================================ FILE: ppdet/utils/compact.py
================================================ import PIL def imagedraw_textsize_c(draw, text, font=None): if int(PIL.__version__.split('.')[0]) < 10: tw, th = draw.textsize(text, font=font) else: left, top, right, bottom = draw.textbbox((0, 0), text, font=font) tw, th = right - left, bottom - top return tw, th ================================================ FILE: ppdet/utils/download.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import os.path as osp import sys import yaml import time import shutil import requests import tqdm import hashlib import base64 import binascii import tarfile import zipfile import errno from paddle.utils.download import _get_unique_endpoints from ppdet.core.workspace import BASE_KEY from .logger import setup_logger from .voc_utils import create_list logger = setup_logger(__name__) __all__ = [ 'get_weights_path', 'get_dataset_path', 'get_config_path', 'download_dataset', 'create_voc_list' ] WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") # dict of {dataset_name: (download_info, sub_dirs)} # download info: [(url, md5sum)] DATASETS = { 'coco': ([ ( 'http://images.cocodataset.org/zips/train2017.zip', 'cced6f7f71b7629ddf16f17bbcfab6b2', ), ( 'http://images.cocodataset.org/zips/val2017.zip', '442b8da7639aecaf257c1dceb8ba8c80', ), ( 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', 'f4bbac642086de4f52a3fdda2de5fa2c', ), ], ["annotations", "train2017", "val2017"]), 'voc': ([ ( 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', '6cd6e144f989b92b3379bac3b3de84fd', ), ( 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', 'c52e279531787c972589f7e41ab4ae64', ), ( 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', 'b6e924de25625d8de591ea690078ad9f', ), ( 'https://paddledet.bj.bcebos.com/data/label_list.txt', '5ae5d62183cfb6f6d3ac109359d06a1b', ), ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), 'wider_face': ([ ( 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', '3fedf70df600953d25982bcd13d91ba2', ), ( 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', 'dfa7d7e790efa35df3788964cf0bbaea', ), ( 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', 'a4a898d6193db4b9ef3260a68bad0dc7', ), ], ["WIDER_train", "WIDER_val", "wider_face_split"]), 'fruit': ([( 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar', 'baa8806617a54ccf3685fa7153388ae6', ), ], ['Annotations', 'JPEGImages']), 'roadsign_voc': ([( 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar', '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']), 'roadsign_coco': ([( 
'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), 'spine_coco': ([( 'https://paddledet.bj.bcebos.com/data/spine.tar', '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']), 'coco_ce': ([( 'https://paddledet.bj.bcebos.com/data/coco_ce.tar', 'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], []), 'culane': ([('https://bj.bcebos.com/v1/paddledet/data/culane.tar', None, ), ], []) } DOWNLOAD_DATASETS_LIST = DATASETS.keys() DOWNLOAD_RETRY_LIMIT = 3 PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/' # When running unit tests, there could be multiple processes that # trying to create DATA_HOME directory simultaneously, so we cannot # use a if condition to check for the existence of the directory; # instead, we use the filesystem as the synchronization mechanism by # catching returned errors. def must_mkdirs(path): try: os.makedirs(path) except OSError as exc: if exc.errno != errno.EEXIST: raise pass def parse_url(url): url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX) return url def get_weights_path(url): """Get weights path from WEIGHTS_HOME, if not exists, download it from url. """ url = parse_url(url) path, _ = get_path(url, WEIGHTS_HOME) return path def get_config_path(url): """Get weights path from CONFIGS_HOME, if not exists, download it from url. """ url = parse_url(url) path = map_path(url, CONFIGS_HOME, path_depth=2) if os.path.isfile(path): return path # config file not found, try download # 1. clear configs directory if osp.isdir(CONFIGS_HOME): shutil.rmtree(CONFIGS_HOME) # 2. get url try: from ppdet import __version__ as version except ImportError: version = None cfg_url = "ppdet://configs/{}/configs.tar".format(version) \ if version else "ppdet://configs/configs.tar" cfg_url = parse_url(cfg_url) # 3. download and decompress cfg_fullname = _download_dist(cfg_url, osp.dirname(CONFIGS_HOME)) _decompress_dist(cfg_fullname) # 4. check config file existing if os.path.isfile(path): return path else: logger.error("Get config {} failed after download, please contact us on " \ "https://github.com/PaddlePaddle/PaddleDetection/issues".format(path)) sys.exit(1) def get_dataset_path(path, annotation, image_dir): """ If path exists, return path. Otherwise, get dataset path from DATASET_HOME, if not exists, download it. """ if _dataset_exists(path, annotation, image_dir): return path data_name = os.path.split(path.strip().lower())[-1] if data_name not in DOWNLOAD_DATASETS_LIST: raise ValueError( "Dataset {} is not valid for reason above, please check again.". 
format(osp.realpath(path))) else: logger.warning( "Dataset {} is not valid for reason above, try searching {} or " "downloading dataset...".format(osp.realpath(path), DATASET_HOME)) for name, dataset in DATASETS.items(): if data_name == name: logger.debug("Parse dataset_dir {} as dataset " "{}".format(path, name)) data_dir = osp.join(DATASET_HOME, name) if name == "spine_coco": if _dataset_exists(data_dir, annotation, image_dir): return data_dir # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007 if name in ['voc', 'fruit', 'roadsign_voc']: exists = True for sub_dir in dataset[1]: check_dir = osp.join(data_dir, sub_dir) if osp.exists(check_dir): logger.info("Found {}".format(check_dir)) else: exists = False if exists: return data_dir # voc exist is checked above, voc is not exist here check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc' for url, md5sum in dataset[0]: get_path(url, data_dir, md5sum, check_exist) # voc should create list after download if name == 'voc': create_voc_list(data_dir) return data_dir raise ValueError("Dataset automaticly downloading Error.") def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): logger.debug("Create voc file list...") devkit_dir = osp.join(data_dir, devkit_subdir) years = ['2007', '2012'] # NOTE: since using auto download VOC # dataset, VOC default label list should be used, # do not generate label_list.txt here. For default # label, see ../data/source/voc.py create_list(devkit_dir, years, data_dir) logger.debug("Create voc file list finished") def map_path(url, root_dir, path_depth=1): # parse path after download to decompress under root_dir assert path_depth > 0, "path_depth should be a positive integer" dirname = url for _ in range(path_depth): dirname = osp.dirname(dirname) fpath = osp.relpath(url, dirname) zip_formats = ['.zip', '.tar', '.gz'] for zip_format in zip_formats: fpath = fpath.replace(zip_format, '') return osp.join(root_dir, fpath) def get_path(url, root_dir, md5sum=None, check_exist=True): """ Download from given url to root_dir. if file or directory specified by url is exists under root_dir, return the path directly, otherwise download from url and decompress it, return the path. 
url (str): download url root_dir (str): root dir for downloading, it should be WEIGHTS_HOME or DATASET_HOME md5sum (str): md5 sum of download package """ # parse path after download to decompress under root_dir fullpath = map_path(url, root_dir) # For same zip file, decompressed directory name different # from zip file name, rename by following map decompress_name_map = { "VOCtrainval_11-May-2012": "VOCdevkit/VOC2012", "VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007", "VOCtest_06-Nov-2007": "VOCdevkit/VOC2007", "annotations_trainval": "annotations" } for k, v in decompress_name_map.items(): if fullpath.find(k) >= 0: fullpath = osp.join(osp.split(fullpath)[0], v) if osp.exists(fullpath) and check_exist: if not osp.isfile(fullpath) or \ _check_exist_file_md5(fullpath, md5sum, url): logger.debug("Found {}".format(fullpath)) return fullpath, True else: os.remove(fullpath) fullname = _download_dist(url, root_dir, md5sum) # new weights format which postfix is 'pdparams' not # need to decompress if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml', '.ttf']: _decompress_dist(fullname) return fullpath, False def download_dataset(path, dataset=None): if dataset not in DATASETS.keys(): logger.error("Unknown dataset {}, it should be " "{}".format(dataset, DATASETS.keys())) return dataset_info = DATASETS[dataset][0] for info in dataset_info: get_path(info[0], path, info[1], False) logger.debug("Download dataset {} finished.".format(dataset)) def _dataset_exists(path, annotation, image_dir): """ Check if user define dataset exists """ if not osp.exists(path): logger.warning("Config dataset_dir {} is not exits, " "dataset config is not valid".format(path)) return False if annotation: annotation_path = osp.join(path, annotation) if not osp.isfile(annotation_path): logger.warning("Config annotation {} is not a " "file, dataset config is not " "valid".format(annotation_path)) return False if image_dir: image_path = osp.join(path, image_dir) if not osp.isdir(image_path): logger.warning("Config image_dir {} is not a " "directory, dataset config is not " "valid".format(image_path)) return False return True def _download(url, path, md5sum=None): """ Download from url, save to path. url (str): download url path (str): download to given path """ must_mkdirs(path) fname = osp.split(url)[-1] fullname = osp.join(path, fname) retry_cnt = 0 while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum, url)): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 else: raise RuntimeError("Download from {} failed. 
" "Retry limit reached".format(url)) logger.info("Downloading {} from {}".format(fname, url)) # NOTE: windows path join may incur \, which is invalid in url if sys.platform == "win32": url = url.replace('\\', '/') req = requests.get(url, stream=True) if req.status_code != 200: raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) # For protecting download interupted, download to # tmp_fullname firstly, move tmp_fullname to fullname # after download finished tmp_fullname = fullname + "_tmp" total_size = req.headers.get('content-length') with open(tmp_fullname, 'wb') as f: if total_size: for chunk in tqdm.tqdm( req.iter_content(chunk_size=1024), total=(int(total_size) + 1023) // 1024, unit='KB'): f.write(chunk) else: for chunk in req.iter_content(chunk_size=1024): if chunk: f.write(chunk) shutil.move(tmp_fullname, fullname) return fullname def _download_dist(url, path, md5sum=None): env = os.environ if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: # Mainly used to solve the problem of downloading data from # different machines in the case of multiple machines. # Different nodes will download data, and the same node # will only download data once. # Reference https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/utils/download.py#L108 rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) num_trainers = int(env['PADDLE_TRAINERS_NUM']) if num_trainers <= 1: return _download(url, path, md5sum) else: fname = osp.split(url)[-1] fullname = osp.join(path, fname) lock_path = fullname + '.download.lock' must_mkdirs(path) if not osp.exists(fullname): with open(lock_path, 'w'): # touch os.utime(lock_path, None) if rank_id_curr_node == 0: _download(url, path, md5sum) os.remove(lock_path) else: while os.path.exists(lock_path): time.sleep(0.5) return fullname else: return _download(url, path, md5sum) def _check_exist_file_md5(filename, md5sum, url): # if md5sum is None, and file to check is weights file, # read md5um from url and check, else check md5sum directly return _md5check_from_url(filename, url) if md5sum is None \ and filename.endswith('pdparams') \ else _md5check(filename, md5sum) def _md5check_from_url(filename, url): # For weights in bcebos URLs, MD5 value is contained # in request header as 'content_md5' req = requests.get(url, stream=True) content_md5 = req.headers.get('content-md5') req.close() if not content_md5 or _md5check( filename, binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( )): return True else: return False def _md5check(fullname, md5sum=None): if md5sum is None: return True logger.debug("File {} md5 checking...".format(fullname)) md5 = hashlib.md5() with open(fullname, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): md5.update(chunk) calc_md5sum = md5.hexdigest() if calc_md5sum != md5sum: logger.warning("File {} md5 check failed, {}(calc) != " "{}(base)".format(fullname, calc_md5sum, md5sum)) return False return True def _decompress(fname): """ Decompress for zip and tar file """ logger.info("Decompressing {}...".format(fname)) # For protecting decompressing interupted, # decompress to fpath_tmp directory firstly, if decompress # successed, move decompress files to fpath and delete # fpath_tmp and remove download compress file. 
fpath = osp.split(fname)[0] fpath_tmp = osp.join(fpath, 'tmp') if osp.isdir(fpath_tmp): shutil.rmtree(fpath_tmp) os.makedirs(fpath_tmp) if fname.find('tar') >= 0: with tarfile.open(fname) as tf: tf.extractall(path=fpath_tmp) elif fname.find('zip') >= 0: with zipfile.ZipFile(fname) as zf: zf.extractall(path=fpath_tmp) elif fname.find('.txt') >= 0: return else: raise TypeError("Unsupported compress file type {}".format(fname)) for f in os.listdir(fpath_tmp): src_dir = osp.join(fpath_tmp, f) dst_dir = osp.join(fpath, f) _move_and_merge_tree(src_dir, dst_dir) shutil.rmtree(fpath_tmp) os.remove(fname) def _decompress_dist(fname): env = os.environ if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: trainer_id = int(env['PADDLE_TRAINER_ID']) num_trainers = int(env['PADDLE_TRAINERS_NUM']) if num_trainers <= 1: _decompress(fname) else: lock_path = fname + '.decompress.lock' from paddle.distributed import ParallelEnv unique_endpoints = _get_unique_endpoints(ParallelEnv() .trainer_endpoints[:]) # NOTE(dkp): _decompress_dist is always performed after # _download_dist, where sub-trainers wait for the download lock # file to be released by sleeping. If decompression is very fast # and finishes within the sleeping gap (e.g. for tiny datasets # such as coco_ce or spine_coco), the main trainer may finish # decompressing and release the lock file before sub-trainers # ever see it. So we only create the lock file in the main # trainer, and all sub-trainers first wait 1s for the main # trainer to create it; since 1s is twice the sleeping gap, this # waiting time keeps the whole trainer pipeline in order. # **change this if you have a more elegant method** if ParallelEnv().current_endpoint in unique_endpoints: with open(lock_path, 'w'): # touch os.utime(lock_path, None) _decompress(fname) os.remove(lock_path) else: time.sleep(1) while os.path.exists(lock_path): time.sleep(0.5) else: _decompress(fname) def _move_and_merge_tree(src, dst): """ Move the src directory to dst; if dst already exists, merge src into dst. """ if not osp.exists(dst): shutil.move(src, dst) elif osp.isfile(src): shutil.move(src, dst) else: for fp in os.listdir(src): src_fp = osp.join(src, fp) dst_fp = osp.join(dst, fp) if osp.isdir(src_fp): if osp.isdir(dst_fp): _move_and_merge_tree(src_fp, dst_fp) else: shutil.move(src_fp, dst_fp) elif osp.isfile(src_fp) and \ not osp.isfile(dst_fp): shutil.move(src_fp, dst_fp) ================================================ FILE: ppdet/utils/fuse_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
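# Usage sketch: fuse every Conv2D + BatchNorm2D pair of a toy eval-mode model
# (the two-layer Sequential here is only an assumption for illustration) and
# check that the fused network is numerically equivalent to the original.
import paddle
import paddle.nn as nn
from ppdet.utils.fuse_utils import fuse_conv_bn

demo = nn.Sequential(nn.Conv2D(3, 8, 3, padding=1), nn.BatchNorm2D(8))
demo.eval()  # the fusion below only implements the eval-mode fold
x = paddle.randn([1, 3, 32, 32])
fused = fuse_conv_bn(demo)  # returns a fused deep copy; demo is untouched
# folding BN into the conv weights must not change the output
assert paddle.allclose(demo(x), fused(x), atol=1e-5).item()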
import copy import paddle import paddle.nn as nn __all__ = ['fuse_conv_bn'] def fuse_conv_bn(model): is_train = False if model.training: model.eval() is_train = True fuse_list = [] tmp_pair = [None, None] for name, layer in model.named_sublayers(): if isinstance(layer, nn.Conv2D): tmp_pair[0] = name if isinstance(layer, nn.BatchNorm2D): tmp_pair[1] = name if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: fuse_list.append(tmp_pair) tmp_pair = [None, None] model = fuse_layers(model, fuse_list) if is_train: model.train() return model def find_parent_layer_and_sub_name(model, name): """ Given the model and the name of a layer, find the parent layer and the sub_name of the layer. For example, if name is 'block_1/convbn_1/conv_1', the parent layer is 'block_1/convbn_1' and the sub_name is `conv_1`. Args: model(paddle.nn.Layer): the model to be quantized. name(string): the name of a layer Returns: parent_layer, subname """ assert isinstance(model, nn.Layer), \ "The model must be the instance of paddle.nn.Layer." assert len(name) > 0, "The input (name) should not be empty." last_idx = 0 idx = 0 parent_layer = model while idx < len(name): if name[idx] == '.': sub_name = name[last_idx:idx] if hasattr(parent_layer, sub_name): parent_layer = getattr(parent_layer, sub_name) last_idx = idx + 1 idx += 1 sub_name = name[last_idx:idx] return parent_layer, sub_name class Identity(nn.Layer): '''a layer to replace bn or relu layers''' def __init__(self, *args, **kwargs): super(Identity, self).__init__() def forward(self, input): return input def fuse_layers(model, layers_to_fuse, inplace=False): ''' fuse layers in layers_to_fuse Args: model(nn.Layer): The model to be fused. layers_to_fuse(list): The layers' names to be fused. For example,"fuse_list = [["conv1", "bn1"], ["conv2", "bn2"]]". A TypeError would be raised if "fuse" was set as True but "fuse_list" was None. Default: None. inplace(bool): Whether apply fusing to the input model. Default: False. Return fused_model(paddle.nn.Layer): The fused model. ''' if not inplace: model = copy.deepcopy(model) for layers_list in layers_to_fuse: layer_list = [] for layer_name in layers_list: parent_layer, sub_name = find_parent_layer_and_sub_name(model, layer_name) layer_list.append(getattr(parent_layer, sub_name)) new_layers = _fuse_func(layer_list) for i, item in enumerate(layers_list): parent_layer, sub_name = find_parent_layer_and_sub_name(model, item) setattr(parent_layer, sub_name, new_layers[i]) return model def _fuse_func(layer_list): '''choose the fuser method and fuse layers''' types = tuple(type(m) for m in layer_list) fusion_method = types_to_fusion_method.get(types, None) new_layers = [None] * len(layer_list) fused_layer = fusion_method(*layer_list) for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items(): fused_layer.register_forward_pre_hook(pre_hook_fn) del layer_list[0]._forward_pre_hooks[handle_id] for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items(): fused_layer.register_forward_post_hook(hook_fn) del layer_list[-1]._forward_post_hooks[handle_id] new_layers[0] = fused_layer for i in range(1, len(layer_list)): identity = Identity() identity.training = layer_list[0].training new_layers[i] = identity return new_layers def _fuse_conv_bn(conv, bn): '''fuse conv and bn for train or eval''' assert(conv.training == bn.training),\ "Conv and BN both must be in the same mode (train or eval)." 
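# Eval-mode fusion applies the closed form implemented in
# _fuse_conv_bn_weights below: with s = gamma / sqrt(running_var + eps),
#     W_fused = W * s   (s broadcast over the output-channel axis)
#     b_fused = (b - running_mean) * s + beta
# This is only valid while the BN statistics are frozen, which is why the
# train-mode branch raises NotImplementedError.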
if conv.training: assert bn._num_features == conv._out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' raise NotImplementedError else: return _fuse_conv_bn_eval(conv, bn) def _fuse_conv_bn_eval(conv, bn): '''fuse conv and bn for eval''' assert (not (conv.training or bn.training)), "Fusion only for eval!" fused_conv = copy.deepcopy(conv) fused_weight, fused_bias = _fuse_conv_bn_weights( fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon, bn.weight, bn.bias) fused_conv.weight.set_value(fused_weight) if fused_conv.bias is None: fused_conv.bias = paddle.create_parameter( shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype) fused_conv.bias.set_value(fused_bias) return fused_conv def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): '''fuse weights and bias of conv and bn''' if conv_b is None: conv_b = paddle.zeros_like(bn_rm) if bn_w is None: bn_w = paddle.ones_like(bn_rm) if bn_b is None: bn_b = paddle.zeros_like(bn_rm) bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps) conv_w = conv_w * \ (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b return conv_w, conv_b types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, } ================================================ FILE: ppdet/utils/logger.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import paddle.distributed as dist __all__ = ['setup_logger'] logger_initialized = [] def setup_logger(name="ppdet", output=None, log_ranks="0"): """ Initialize logger and set its verbosity level to INFO. Args: output (str): a file name or a directory to save log. If None, will not save log file. If ends with ".txt" or ".log", assumed to be a file name. Otherwise, logs will be saved to `output/log.txt`. name (str): the root module name of this logger log_ranks (str): The ids of gpu to log which are separated by "," when more than 1, "0" by default. 
Returns: logging.Logger: a logger """ logger = logging.getLogger(name) if name in logger_initialized: return logger logger.setLevel(logging.INFO) logger.propagate = False formatter = logging.Formatter( "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S") if isinstance(log_ranks, str): log_ranks = [int(i) for i in log_ranks.split(',')] elif isinstance(log_ranks, int): log_ranks = [log_ranks] # stdout logging: master only local_rank = dist.get_rank() if local_rank in log_ranks: ch = logging.StreamHandler(stream=sys.stdout) ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # file logging: all workers if output is not None: if output.endswith(".txt") or output.endswith(".log"): filename = output else: filename = os.path.join(output, "log.txt") if local_rank > 0: filename = filename + ".rank{}".format(local_rank) os.makedirs(os.path.dirname(filename), exist_ok=True) fh = logging.FileHandler(filename, mode='a') fh.setLevel(logging.DEBUG) fh.setFormatter(logging.Formatter()) logger.addHandler(fh) logger_initialized.append(name) return logger ================================================ FILE: ppdet/utils/profiler.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import paddle import paddle.profiler as profiler # A global variable to record the number of calling times for profiler # functions. It is used to specify the tracing range of training steps. _profiler_step_id = 0 # A global variable to avoid parsing from string every time. _profiler_options = None _prof = None class ProfilerOptions(object): ''' Use a string to initialize a ProfilerOptions. The string should be in the format: "key1=value1;key2=value2;key3=value3". For example: "profile_path=model.profile" "batch_range=[50, 60]; profile_path=model.profile" "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" ProfilerOptions supports the following key-value pairs: batch_range - an integer list, e.g. [100, 110]. state - a string, the optional values are 'CPU', 'GPU' or 'All'. sorted_key - a string, the optional values are 'calls', 'total', 'max', 'min' or 'ave'. tracer_option - a string, the optional values are 'Default', 'OpDetail', 'AllOpDetail'. profile_path - a string, the path to save the serialized profile data, which can be used to generate a timeline. exit_on_finished - a boolean. 
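timer_only - a boolean, when true only the model's throughput and time
    overhead are collected, without detailed per-op event tracing.
A combined example (keys as listed above):
    "batch_range=[10, 20]; timer_only=False; exit_on_finished=true"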
''' def __init__(self, options_str): assert isinstance(options_str, str) self._options = { 'batch_range': [10, 20], 'state': 'All', 'sorted_key': 'total', 'tracer_option': 'Default', 'profile_path': '/tmp/profile', 'exit_on_finished': True, 'timer_only': True } self._parse_from_string(options_str) def _parse_from_string(self, options_str): for kv in options_str.replace(' ', '').split(';'): key, value = kv.split('=') if key == 'batch_range': value_list = value.replace('[', '').replace(']', '').split(',') value_list = list(map(int, value_list)) if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ 1] > value_list[0]: self._options[key] = value_list elif key == 'exit_on_finished': self._options[key] = value.lower() in ("yes", "true", "t", "1") elif key in [ 'state', 'sorted_key', 'tracer_option', 'profile_path' ]: self._options[key] = value elif key == 'timer_only': self._options[key] = value def __getitem__(self, name): if self._options.get(name, None) is None: raise ValueError( "ProfilerOptions does not have an option named %s." % name) return self._options[name] def add_profiler_step(options_str=None): ''' Enable the operator-level timing using PaddlePaddle's profiler. The profiler uses a independent variable to count the profiler steps. One call of this function is treated as a profiler step. Args: profiler_options - a string to initialize the ProfilerOptions. Default is None, and the profiler is disabled. ''' if options_str is None: return global _prof global _profiler_step_id global _profiler_options if _profiler_options is None: _profiler_options = ProfilerOptions(options_str) # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan # timer_only = True only the model's throughput and time overhead are displayed # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. # timer_only = False the output Timeline information can be found in the profiler_log directory if _prof is None: _timer_only = str(_profiler_options['timer_only']) == str(True) _prof = profiler.Profiler( scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), timer_only = _timer_only) _prof.start() else: _prof.step() if _profiler_step_id == _profiler_options['batch_range'][1]: _prof.stop() _prof.summary( op_detail=True, thread_sep=False, time_unit='ms') _prof = None if _profiler_options['exit_on_finished']: sys.exit(0) _profiler_step_id += 1 ================================================ FILE: ppdet/utils/stats.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
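# Usage sketch: TrainingStats keeps one SmoothedValue per metric and reports
# the windowed median of each; the loss values fed below are made up purely
# for illustration.
from ppdet.utils.stats import TrainingStats

stats = TrainingStats(window_size=20)
for step in range(3):
    stats.update({'loss': 2.5 - 0.1 * step, 'loss_bbox': 0.8})
print(stats.log(extras={'epoch': 0}))
# -> epoch: 0 loss: 2.400000 loss_bbox: 0.800000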
import collections import numpy as np __all__ = ['SmoothedValue', 'TrainingStats'] class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20, fmt=None): if fmt is None: fmt = "{median:.4f} ({avg:.4f})" self.deque = collections.deque(maxlen=window_size) self.fmt = fmt self.total = 0. self.count = 0 def update(self, value, n=1): self.deque.append(value) self.count += n self.total += value * n @property def median(self): return np.median(self.deque) @property def avg(self): return np.mean(self.deque) @property def max(self): return np.max(self.deque) @property def value(self): return self.deque[-1] @property def global_avg(self): return self.total / self.count def __str__(self): return self.fmt.format( median=self.median, avg=self.avg, max=self.max, value=self.value) class TrainingStats(object): def __init__(self, window_size, delimiter=' '): self.meters = None self.window_size = window_size self.delimiter = delimiter def update(self, stats): if self.meters is None: self.meters = { k: SmoothedValue(self.window_size) for k in stats.keys() } for k, v in self.meters.items(): v.update(float(stats[k])) def get(self, extras=None): stats = collections.OrderedDict() if extras: for k, v in extras.items(): stats[k] = v for k, v in self.meters.items(): stats[k] = format(v.median, '.6f') return stats def log(self, extras=None): d = self.get(extras) strs = [] for k, v in d.items(): strs.append("{}: {}".format(k, str(v))) return self.delimiter.join(strs) ================================================ FILE: ppdet/utils/visualizer.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
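# Usage sketch: draw COCO-style detection results onto a PIL image. The bbox
# entry follows the json_results convention used across ppdet (xywh box plus
# image_id/category_id/score); the grey canvas and category name here are
# stand-ins for real data. Note draw_bbox fetches a label font from bcebos,
# so the first run needs network access.
from PIL import Image
from ppdet.utils.visualizer import visualize_results

img = Image.new('RGB', (320, 240), (128, 128, 128))
bbox_res = [{'image_id': 0, 'category_id': 1,
             'bbox': [40.0, 30.0, 120.0, 90.0], 'score': 0.92}]
vis = visualize_results(img, bbox_res, None, None, None, None,
                        im_id=0, catid2name={1: 'person'}, threshold=0.5)
vis.save('vis_demo.jpg')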
from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import os import numpy as np from PIL import Image, ImageDraw, ImageFont import cv2 import math from .colormap import colormap from ppdet.utils.logger import setup_logger from ppdet.utils.compact import imagedraw_textsize_c from ppdet.utils.download import get_path logger = setup_logger(__name__) __all__ = ['visualize_results'] def visualize_results(image, bbox_res, mask_res, segm_res, keypoint_res, pose3d_res, im_id, catid2name, threshold=0.5): """ Visualize bbox and mask results """ if bbox_res is not None: image = draw_bbox(image, im_id, catid2name, bbox_res, threshold) if mask_res is not None: image = draw_mask(image, im_id, mask_res, threshold) if segm_res is not None: image = draw_segm(image, im_id, catid2name, segm_res, threshold) if keypoint_res is not None: image = draw_pose(image, keypoint_res, threshold) if pose3d_res is not None: pose3d = np.array(pose3d_res[0]['pose3d']) * 1000 image = draw_pose3d(image, pose3d, visual_thread=threshold) return image def draw_mask(image, im_id, segms, threshold, alpha=0.7): """ Draw mask on image """ mask_color_id = 0 w_ratio = .4 color_list = colormap(rgb=True) img_array = np.array(image).astype('float32') for dt in np.array(segms): if im_id != dt['image_id']: continue segm, score = dt['segmentation'], dt['score'] if score < threshold: continue import pycocotools.mask as mask_util mask = mask_util.decode(segm) * 255 color_mask = color_list[mask_color_id % len(color_list), 0:3] mask_color_id += 1 for c in range(3): color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 idx = np.nonzero(mask) img_array[idx[0], idx[1], :] *= 1.0 - alpha img_array[idx[0], idx[1], :] += alpha * color_mask return Image.fromarray(img_array.astype('uint8')) def draw_bbox(image, im_id, catid2name, bboxes, threshold): """ Draw bbox on image """ font_url = "https://paddledet.bj.bcebos.com/simfang.ttf" font_path, _ = get_path(font_url, "~/.cache/paddle/") font_size = 18 font = ImageFont.truetype(font_path, font_size, encoding="utf-8") draw = ImageDraw.Draw(image) catid2color = {} color_list = colormap(rgb=True)[:40] for dt in np.array(bboxes): if im_id != dt['image_id']: continue catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] if score < threshold: continue if catid not in catid2color: idx = np.random.randint(len(color_list)) catid2color[catid] = color_list[idx] color = tuple(catid2color[catid]) # draw bbox if len(bbox) == 4: # draw bbox xmin, ymin, w, h = bbox xmax = xmin + w ymax = ymin + h draw.line( [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], width=2, fill=color) elif len(bbox) == 8: x1, y1, x2, y2, x3, y3, x4, y4 = bbox draw.line( [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill=color) xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) else: logger.error('the shape of bbox must be [M, 4] or [M, 8]!') # draw label text = "{} {:.2f}".format(catid2name[catid], score) tw, th = imagedraw_textsize_c(draw, text, font=font) draw.rectangle( [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255), font=font) return image def save_result(save_path, results, catid2name, threshold): """ save result as txt """ img_id = int(results["im_id"]) with open(save_path, 'w') as f: if "bbox_res" in results: for dt in results["bbox_res"]: catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] if score < 
threshold: continue # each bbox result as a line # for rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4 # for bbox: classname score x1 y1 w h bbox_pred = '{} {} '.format(catid2name[catid], score) + ' '.join( [str(e) for e in bbox]) f.write(bbox_pred + '\n') elif "keypoint_res" in results: for dt in results["keypoint_res"]: kpts = dt['keypoints'] scores = dt['score'] keypoint_pred = [img_id, scores, kpts] print(keypoint_pred, file=f) else: print("No valid results found, skip txt save") def draw_segm(image, im_id, catid2name, segms, threshold, alpha=0.7, draw_box=True): """ Draw segmentation on image """ mask_color_id = 0 w_ratio = .4 color_list = colormap(rgb=True) img_array = np.array(image).astype('float32') for dt in np.array(segms): if im_id != dt['image_id']: continue segm, score, catid = dt['segmentation'], dt['score'], dt['category_id'] if score < threshold: continue import pycocotools.mask as mask_util mask = mask_util.decode(segm) * 255 color_mask = color_list[mask_color_id % len(color_list), 0:3] mask_color_id += 1 for c in range(3): color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 idx = np.nonzero(mask) img_array[idx[0], idx[1], :] *= 1.0 - alpha img_array[idx[0], idx[1], :] += alpha * color_mask if not draw_box: from scipy import ndimage # lazy import, mirroring pycocotools above center_y, center_x = ndimage.center_of_mass(mask) label_text = "{}".format(catid2name[catid]) vis_pos = (max(int(center_x) - 10, 0), int(center_y)) cv2.putText(img_array, label_text, vis_pos, cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255)) else: mask = mask_util.decode(segm) * 255 sum_x = np.sum(mask, axis=0) x = np.where(sum_x > 0.5)[0] sum_y = np.sum(mask, axis=1) y = np.where(sum_y > 0.5)[0] x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] cv2.rectangle(img_array, (x0, y0), (x1, y1), tuple(color_mask.astype('int32').tolist()), 1) bbox_text = '%s %.2f' % (catid2name[catid], score) t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), tuple(color_mask.astype('int32').tolist()), -1) cv2.putText( img_array, bbox_text, (x0, y0 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), 1, lineType=cv2.LINE_AA) return Image.fromarray(img_array.astype('uint8')) def draw_pose(image, results, visual_thread=0.6, save_name='pose.jpg', save_dir='output', returnimg=False, ids=None): try: import matplotlib.pyplot as plt import matplotlib plt.switch_backend('agg') except Exception as e: logger.error('Matplotlib not found, please install matplotlib.' 
'for example: `pip install matplotlib`.') raise e skeletons = np.array([item['keypoints'] for item in results]) kpt_nums = 17 if len(skeletons) > 0: kpt_nums = int(skeletons.shape[1] / 3) skeletons = skeletons.reshape(-1, kpt_nums, 3) if kpt_nums == 17: #plot coco keypoint EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)] else: #plot mpii keypoint EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8), (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12), (8, 13)] NUM_EDGES = len(EDGES) colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] plt.figure() img = np.array(image).astype('float32') color_set = results['colors'] if 'colors' in results else None if 'bbox' in results and ids is None: bboxs = results['bbox'] for j, rect in enumerate(bboxs): xmin, ymin, xmax, ymax = rect color = colors[0] if color_set is None else colors[color_set[j] % len(colors)] cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) canvas = img.copy() for i in range(kpt_nums): for j in range(len(skeletons)): if skeletons[j][i, 2] < visual_thread: continue if ids is None: color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) cv2.circle( canvas, tuple(skeletons[j][i, 0:2].astype('int32')), 2, color, thickness=-1) to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0) fig = matplotlib.pyplot.gcf() stickwidth = 2 for i in range(NUM_EDGES): for j in range(len(skeletons)): edge = EDGES[i] if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[1], 2] < visual_thread: continue cur_canvas = canvas.copy() X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] mX = np.mean(X) mY = np.mean(Y) length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) if ids is None: color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) cv2.fillConvexPoly(cur_canvas, polygon, color) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) image = Image.fromarray(canvas.astype('uint8')) plt.close() return image def draw_pose3d(image, pose3d, pose2d=None, visual_thread=0.6, save_name='pose3d.jpg', returnimg=True): try: import matplotlib.pyplot as plt import matplotlib plt.switch_backend('agg') except Exception as e: logger.error('Matplotlib not found, please install matplotlib.' 
'for example: `pip install matplotlib`.') raise e if pose3d.shape[0] == 24: joints_connectivity_dict = [ [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1], [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1], [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0], [23, 21, 1] ] elif pose3d.shape[0] == 14: joints_connectivity_dict = [ [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0], [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1], [12, 13, 1] ] else: print( "undefined joints number: {}, cannot visualize because the joint connectivity is unknown". format(pose3d.shape[0])) return def draw3Dpose(pose3d, ax, lcolor="#3498db", rcolor="#e74c3c", add_labels=False): # pose3d = orthographic_projection(pose3d, cam) for i in joints_connectivity_dict: x, y, z = [ np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3) ] ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor) RADIUS = 1000 center_xy = 2 if pose3d.shape[0] == 14 else 14 x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy, 2] ax.set_xlim3d([-RADIUS + x, RADIUS + x]) ax.set_ylim3d([-RADIUS + y, RADIUS + y]) ax.set_zlim3d([-RADIUS + z, RADIUS + z]) ax.set_xlabel("x") ax.set_ylabel("y") ax.set_zlabel("z") def draw2Dpose(pose2d, ax, lcolor="#3498db", rcolor="#e74c3c", add_labels=False): for i in joints_connectivity_dict: if pose2d[i[0], 2] and pose2d[i[1], 2]: x, y = [ np.array([pose2d[i[0], j], pose2d[i[1], j]]) for j in range(2) ] ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor) def draw_img_pose(pose3d, pose2d=None, frame=None, figsize=(12, 12), savepath=None): fig = plt.figure(figsize=figsize, dpi=80) # fig.clear() fig.tight_layout() ax = fig.add_subplot(221) if frame is not None: ax.imshow(frame, interpolation='nearest') if pose2d is not None: draw2Dpose(pose2d, ax) ax = fig.add_subplot(222, projection='3d') ax.view_init(45, 45) draw3Dpose(pose3d, ax) ax = fig.add_subplot(223, projection='3d') ax.view_init(0, 0) draw3Dpose(pose3d, ax) ax = fig.add_subplot(224, projection='3d') ax.view_init(0, 90) draw3Dpose(pose3d, ax) if savepath is not None: plt.savefig(savepath) plt.close() else: return fig def fig2data(fig): """ fig = plt.figure() image = fig2data(fig) @brief Convert a Matplotlib figure to a 3D numpy array with RGBA channels and return it @param fig a matplotlib figure @return a numpy 3D array of RGBA values """ # draw the renderer fig.canvas.draw() # Get the RGBA buffer from the figure w, h = fig.canvas.get_width_height() buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8).copy() buf.shape = (w, h, 4) # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode buf = np.roll(buf, 3, axis=2) image = Image.frombytes("RGBA", (w, h), buf.tobytes()) return image.convert("RGB") fig = draw_img_pose(pose3d, pose2d, frame=image) data = fig2data(fig) if returnimg is False: data.save(save_name) else: return data ================================================ FILE: ppdet/utils/voc_utils.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import os.path as osp import re import random __all__ = ['create_list'] def create_list(devkit_dir, years, output_dir): """ create following list: 1. trainval.txt 2. test.txt """ trainval_list = [] test_list = [] for year in years: trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) trainval_list.extend(trainval) test_list.extend(test) random.shuffle(trainval_list) with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: for item in trainval_list: ftrainval.write(item[0] + ' ' + item[1] + '\n') with open(osp.join(output_dir, 'test.txt'), 'w') as ftest: for item in test_list: ftest.write(item[0] + ' ' + item[1] + '\n') def _get_voc_dir(devkit_dir, year, type): return osp.join(devkit_dir, 'VOC' + year, type) def _walk_voc_dir(devkit_dir, year, output_dir): filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') trainval_list = [] test_list = [] added = set() for _, _, files in os.walk(filelist_dir): for fname in files: img_ann_list = [] if re.match(r'[a-z]+_trainval\.txt', fname): img_ann_list = trainval_list elif re.match(r'[a-z]+_test\.txt', fname): img_ann_list = test_list else: continue fpath = osp.join(filelist_dir, fname) for line in open(fpath): name_prefix = line.strip().split()[0] if name_prefix in added: continue added.add(name_prefix) ann_path = osp.join( osp.relpath(annotation_dir, output_dir), name_prefix + '.xml') img_path = osp.join( osp.relpath(img_dir, output_dir), name_prefix + '.jpg') img_ann_list.append((img_path, ann_path)) return trainval_list, test_list ================================================ FILE: requirements.txt ================================================
numpy < 2.0
tqdm
typeguard
visualdl>=2.2.0
opencv-python <= 4.6.0
PyYAML
shapely
scipy
terminaltables
Cython
pycocotools
setuptools
Pillow

# for MOT evaluation and inference
lapx
motmetrics
sklearn==0.0

# for vehicleplate in deploy/pipeline/ppvehicle
pyclipper

# for culane data augmentation
imgaug>=0.4.0
================================================ FILE: scripts/build_wheel.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
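For reference, a minimal sketch of driving the create_list utility shown above; the VOCdevkit path, years, and output directory are hypothetical and must match your local layout:

from ppdet.utils.voc_utils import create_list

# Writes trainval.txt / test.txt with "image-path annotation-path" lines,
# shuffling the trainval split as create_list does internally.
create_list('dataset/voc/VOCdevkit', ['2007', '2012'], 'dataset/voc')
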
#================================================= # Utils #================================================= # directory config DIST_DIR="dist" BUILD_DIR="build" EGG_DIR="paddledet.egg-info" CFG_DIR="configs" TEST_DIR=".tests" DATA_DIR="dataset" # command line log config RED='\033[0;31m' BLUE='\033[0;34m' GREEN='\033[1;32m' BOLD='\033[1m' NONE='\033[0m' function python_version_check() { PY_MAIN_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'` PY_SUB_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'` echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}" if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "5" ]; then echo -e "${RED}FAIL:${NONE} please use Python >= 3.5 !" exit 1 fi } function init() { echo -e "${BLUE}[init]${NONE} removing building directory..." rm -rf $DIST_DIR $BUILD_DIR $EGG_DIR $TEST_DIR if [ `pip list | grep paddledet | wc -l` -gt 0 ]; then echo -e "${BLUE}[init]${NONE} uninstalling paddledet..." pip uninstall -y paddledet fi echo -e "${BLUE}[init]${NONE} ${GREEN}init success\n" } function build_and_install() { echo -e "${BLUE}[build]${NONE} building paddledet wheel..." python setup.py sdist bdist_wheel if [ $? -ne 0 ]; then echo -e "${RED}[FAIL]${NONE} build paddledet wheel failed !" exit 1 fi echo -e "${BLUE}[build]${NONE} ${GREEN}build paddledet wheel success\n" echo -e "${BLUE}[install]${NONE} installing paddledet..." cd $DIST_DIR find . -name "paddledet*.whl" | xargs pip install if [ $? -ne 0 ]; then cd .. echo -e "${RED}[FAIL]${NONE} install paddledet wheel failed !" exit 1 fi echo -e "${BLUE}[install]${NONE} ${GREEN}paddledet install success\n" cd .. } function unittest() { if [ -d $TEST_DIR ]; then rm -rf $TEST_DIR fi; echo -e "${BLUE}[unittest]${NONE} run unittests..." # NOTE: perform unittests under TEST_DIR to # make sure installed paddledet is used mkdir $TEST_DIR cp -r $CFG_DIR $TEST_DIR cp -r $DATA_DIR $TEST_DIR cd $TEST_DIR if [ $? != 0 ]; then exit 1 fi find "../ppdet" -wholename '*tests/test_*' -type f -print0 | \ xargs -0 -I{} -n1 -t bash -c 'python -u -s {}' # clean TEST_DIR cd .. rm -rf $TEST_DIR echo -e "${BLUE}[unittest]${NONE} ${GREEN}unittests success\n${NONE}" } function cleanup() { if [ -d $TEST_DIR ]; then rm -rf $TEST_DIR fi rm -rf $BUILD_DIR $EGG_DIR pip uninstall -y paddledet } function abort() { echo -e "${RED}[FAIL]${NONE} build wheel and unittest failed ! please check your code" 1>&2 cur_dir=`basename "$PWD"` if [ "$cur_dir" == "$TEST_DIR" -o "$cur_dir" == "$DIST_DIR" ]; then cd .. 
fi rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR $TEST_DIR pip uninstall -y paddledet } python_version_check trap 'abort' 0 set -e init build_and_install unittest cleanup # get Paddle version PADDLE_VERSION=`python -c "import paddle; print(paddle.version.full_version)"` PADDLE_COMMIT=`python -c "import paddle; print(paddle.version.commit)"` PADDLE_COMMIT=`git rev-parse --short $PADDLE_COMMIT` # get PaddleDetection branch PPDET_BRANCH=`git rev-parse --abbrev-ref HEAD` PPDET_COMMIT=`git rev-parse --short HEAD` # get Python version PYTHON_VERSION=`python -c "import platform; print(platform.python_version())"` echo -e "\n${GREEN}paddledet wheel compiled and checked success !${NONE} ${BLUE}Python version:${NONE} $PYTHON_VERSION ${BLUE}Paddle version:${NONE} $PADDLE_VERSION ($PADDLE_COMMIT) ${BLUE}PaddleDetection branch:${NONE} $PPDET_BRANCH ($PPDET_COMMIT)\n" echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist" trap : 0 ================================================ FILE: scripts/eval.sh ================================================ ../../../../py37_meta_pd-2.4_cu11_comer/bin/python3.7 tools/eval.py \ --config configs/artdetrv3_final/cortdetr_noisegroupx3_o2m_r18vd_120e_coco.yml \ -o weights=outputs/ ================================================ FILE: scripts/kill.sh ================================================ ps -ef | grep train.py | awk '{print $2}' | xargs kill -9 kill -9 $(lsof -t /dev/nvidia*) ================================================ FILE: scripts/train.sh ================================================ PY37=/root/paddlejob/workspace/env_run/ws/py37_meta_pd-2.4_cu11_comer/bin/python3.7 # PY37=../anaconda3/envs/py37_meta_pd-2.3.0_cu11/bin/python3.7 export CUDA_VISIBLE_DEVICES=0,1,2,3 nohup $PY37 -m paddle.distributed.launch --gpus=0,1,2,3 \ tools/train.py \ -c configs/artdetrv3/rtdetrv3_final_r18vd_6x_coco.yml --eval\ -r output/rtdetrv3_final_r18vd_6x_coco/1 \ -o save_dir=output/rtdetrv3_final_r18vd_6x_coco \ &> output/train_rtdetrv3_final_r18vd_6x_coco.log& ================================================ FILE: tools/anchor_cluster.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
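The clusterer below assigns each ground-truth (w, h) pair to the anchor with the highest corner-aligned IoU. A self-contained numpy sketch of that similarity, with toy values that are not part of the tool:

import numpy as np

def wh_iou(whs, centers):
    # Corner-aligned IoU: boxes are assumed to share a corner, so the
    # intersection is the product of the elementwise minimum of (w, h).
    inter = np.minimum(whs[:, None], centers[None]).prod(2)
    return inter / (whs[:, None].prod(2) + centers[None].prod(2) - inter)

whs = np.array([[10., 20.], [30., 30.]])
centers = np.array([[12., 18.], [28., 32.]])
print(wh_iou(whs, centers).argmax(1))  # index of the best-matching anchor per box
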
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.anchor_cluster') from scipy.cluster.vq import kmeans import numpy as np from tqdm import tqdm from ppdet.utils.cli import ArgsParser from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.core.workspace import load_config, merge_config class BaseAnchorCluster(object): def __init__(self, n, cache_path, cache, verbose=True): """ Base Anchor Cluster Args: n (int): number of clusters cache_path (str): cache directory path cache (bool): whether using cache verbose (bool): whether print results """ super(BaseAnchorCluster, self).__init__() self.n = n self.cache_path = cache_path self.cache = cache self.verbose = verbose def print_result(self, centers): raise NotImplementedError('%s.print_result is not available' % self.__class__.__name__) def get_whs(self): whs_cache_path = os.path.join(self.cache_path, 'whs.npy') shapes_cache_path = os.path.join(self.cache_path, 'shapes.npy') if self.cache and os.path.exists(whs_cache_path) and os.path.exists( shapes_cache_path): self.whs = np.load(whs_cache_path) self.shapes = np.load(shapes_cache_path) return self.whs, self.shapes whs = np.zeros((0, 2)) shapes = np.zeros((0, 2)) self.dataset.parse_dataset() roidbs = self.dataset.roidbs for rec in tqdm(roidbs): h, w = rec['h'], rec['w'] bbox = rec['gt_bbox'] wh = bbox[:, 2:4] - bbox[:, 0:2] + 1 wh = wh / np.array([[w, h]]) shape = np.ones_like(wh) * np.array([[w, h]]) whs = np.vstack((whs, wh)) shapes = np.vstack((shapes, shape)) if self.cache: os.makedirs(self.cache_path, exist_ok=True) np.save(whs_cache_path, whs) np.save(shapes_cache_path, shapes) self.whs = whs self.shapes = shapes return self.whs, self.shapes def calc_anchors(self): raise NotImplementedError('%s.calc_anchors is not available' % self.__class__.__name__) def __call__(self): self.get_whs() centers = self.calc_anchors() if self.verbose: self.print_result(centers) return centers class YOLOv2AnchorCluster(BaseAnchorCluster): def __init__(self, n, dataset, size, cache_path, cache, iters=1000, verbose=True): super(YOLOv2AnchorCluster, self).__init__( n, cache_path, cache, verbose=verbose) """ YOLOv2 Anchor Cluster The code is based on https://github.com/AlexeyAB/darknet/blob/master/scripts/gen_anchors.py Args: n (int): number of clusters dataset (DataSet): DataSet instance, VOC or COCO size (list): [w, h] cache_path (str): cache directory path cache (bool): whether using cache iters (int): kmeans algorithm iters verbose (bool): whether print results """ self.dataset = dataset self.size = size self.iters = iters def print_result(self, centers): logger.info('%d anchor cluster result: [w, h]' % self.n) for w, h in centers: logger.info('[%d, %d]' % (round(w), round(h))) def metric(self, whs, centers): wh1 = whs[:, None] wh2 = centers[None] inter = np.minimum(wh1, wh2).prod(2) return inter / (wh1.prod(2) + wh2.prod(2) - inter) def kmeans_expectation(self, whs, centers, assignments): dist = self.metric(whs, centers) new_assignments = dist.argmax(1) converged = (new_assignments == assignments).all() return converged, new_assignments def kmeans_maximizations(self, whs, centers, assignments): new_centers = np.zeros_like(centers) for i in range(centers.shape[0]): mask = (assignments == 
i) if mask.sum(): new_centers[i, :] = whs[mask].mean(0) return new_centers def calc_anchors(self): self.whs = self.whs * np.array([self.size]) # random select k centers whs, n, iters = self.whs, self.n, self.iters logger.info('Running kmeans for %d anchors on %d points...' % (n, len(whs))) idx = np.random.choice(whs.shape[0], size=n, replace=False) centers = whs[idx] assignments = np.full(whs.shape[0:1], -1) # init to -1 so the first E step cannot spuriously report convergence # kmeans if n == 1: return self.kmeans_maximizations(whs, centers, assignments) pbar = tqdm(range(iters), desc='Cluster anchors with k-means algorithm') for _ in pbar: # E step converged, assignments = self.kmeans_expectation(whs, centers, assignments) if converged: logger.info('kmeans algorithm has converged') break # M step centers = self.kmeans_maximizations(whs, centers, assignments) ious = self.metric(whs, centers) pbar.desc = 'avg_iou: %.4f' % (ious.max(1).mean()) centers = sorted(centers, key=lambda x: x[0] * x[1]) return centers def main(): parser = ArgsParser() parser.add_argument( '--n', '-n', default=9, type=int, help='num of clusters') parser.add_argument( '--iters', '-i', default=1000, type=int, help='num of iterations for kmeans') parser.add_argument( '--verbose', '-v', default=True, type=bool, help='whether print result') parser.add_argument( '--size', '-s', default=None, type=str, help='image size: w,h, using comma as delimiter') parser.add_argument( '--method', '-m', default='v2', type=str, help='cluster method, v2 is only supported now') parser.add_argument( '--cache_path', default='cache', type=str, help='cache path') parser.add_argument( '--cache', action='store_true', help='whether use cache') FLAGS = parser.parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version if 'use_gpu' not in cfg: cfg.use_gpu = False check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version('develop') # get dataset dataset = cfg['TrainDataset'] if FLAGS.size: if ',' in FLAGS.size: size = list(map(int, FLAGS.size.split(','))) assert len(size) == 2, "the format of size is incorrect" else: size = int(FLAGS.size) size = [size, size] elif 'inputs_def' in cfg['TestReader'] and 'image_shape' in cfg[ 'TestReader']['inputs_def']: size = cfg['TestReader']['inputs_def']['image_shape'][1:] else: raise ValueError('size is not specified') if FLAGS.method == 'v2': cluster = YOLOv2AnchorCluster(FLAGS.n, dataset, size, FLAGS.cache_path, FLAGS.cache, FLAGS.iters, FLAGS.verbose) else: raise ValueError('cluster method: %s is not supported' % FLAGS.method) anchors = cluster() if __name__ == "__main__": main() ================================================ FILE: tools/box_distribution.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
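A hedged sketch of running the k-means clusterer above without the CLI; the config path is hypothetical, and tools/ must be importable from your working directory:

from ppdet.core.workspace import load_config
from tools.anchor_cluster import YOLOv2AnchorCluster  # assumes tools/ is on sys.path

cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')  # any config with a TrainDataset
cluster = YOLOv2AnchorCluster(
    n=9, dataset=cfg['TrainDataset'], size=[640, 640],
    cache_path='cache', cache=False)
anchors = cluster()  # [w, h] centers, sorted by area
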
import matplotlib.pyplot as plt import json import numpy as np import argparse from pycocotools.coco import COCO from tqdm import tqdm def median(data): data.sort() mid = len(data) // 2 median = (data[mid] + data[~mid]) / 2 return median def draw_distribution(width, height, out_path): w_bins = int((max(width) - min(width)) // 10) h_bins = int((max(height) - min(height)) // 10) plt.figure() plt.subplot(221) plt.hist(width, bins=w_bins, color='green') plt.xlabel('Width rate *1000') plt.ylabel('number') plt.title('Distribution of Width') plt.subplot(222) plt.hist(height, bins=h_bins, color='blue') plt.xlabel('Height rate *1000') plt.title('Distribution of Height') plt.savefig(out_path) print(f'Distribution saved as {out_path}') plt.show() def get_ratio_infos(jsonfile, out_img, eval_size, small_stride): coco = COCO(annotation_file=jsonfile) allannjson = json.load(open(jsonfile, 'r')) be_im_id = allannjson['annotations'][0]['image_id'] be_im_w = [] be_im_h = [] ratio_w = [] ratio_h = [] im_wid, im_hei = [], [] for ann in tqdm(allannjson['annotations']): if ann['iscrowd']: continue x0, y0, w, h = ann['bbox'][:] if be_im_id == ann['image_id']: be_im_w.append(w) be_im_h.append(h) else: im_w = coco.imgs[be_im_id]['width'] im_h = coco.imgs[be_im_id]['height'] im_wid.append(im_w) im_hei.append(im_h) im_m_w = np.mean(be_im_w) im_m_h = np.mean(be_im_h) dis_w = im_m_w / im_w dis_h = im_m_h / im_h ratio_w.append(dis_w) ratio_h.append(dis_h) be_im_id = ann['image_id'] be_im_w = [w] be_im_h = [h] im_w = coco.imgs[be_im_id]['width'] im_h = coco.imgs[be_im_id]['height'] im_wid.append(im_w) im_hei.append(im_h) all_im_m_w = np.mean(im_wid) all_im_m_h = np.mean(im_hei) im_m_w = np.mean(be_im_w) im_m_h = np.mean(be_im_h) dis_w = im_m_w / im_w dis_h = im_m_h / im_h ratio_w.append(dis_w) ratio_h.append(dis_h) mid_w = median(ratio_w) mid_h = median(ratio_h) reg_ratio = [] ratio_all = ratio_h + ratio_w for r in ratio_all: if r < 0.2: reg_ratio.append(r) elif r < 0.4: reg_ratio.append(r / 2) else: reg_ratio.append(r / 4) reg_ratio = sorted(reg_ratio) max_ratio = reg_ratio[int(0.95 * len(reg_ratio))] reg_max = round(max_ratio * eval_size / small_stride) ratio_w = [i * 1000 for i in ratio_w] ratio_h = [i * 1000 for i in ratio_h] print(f'Suggested reg_range[1] is {reg_max+1}') print(f'Mean of all img_w is {all_im_m_w}') print(f'Mean of all img_h is {all_im_m_h}') print(f'Median of ratio_w is {mid_w}') print(f'Median of ratio_h is {mid_h}') print('all_img with box: ', len(ratio_h)) print('all_ann: ', len(allannjson['annotations'])) draw_distribution(ratio_w, ratio_h, out_img) def main(): parser = argparse.ArgumentParser() parser.add_argument( '--json_path', type=str, default=None, help="Dataset json path.") parser.add_argument('--eval_size', type=int, default=640, help="eval size.") parser.add_argument( '--small_stride', type=int, default=8, help="smallest stride.") parser.add_argument( '--out_img', type=str, default='box_distribution.jpg', help="Name of distribution img.") args = parser.parse_args() get_ratio_infos(args.json_path, args.out_img, args.eval_size, args.small_stride) if __name__ == "__main__": main() ================================================ FILE: tools/cam_ppdet.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore 
warning log import warnings warnings.filterwarnings('ignore') from ppdet.utils.cli import ArgsParser, merge_args from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_version, check_config from ppdet.utils.cam_utils import BBoxCAM import paddle def parse_args(): parser = ArgsParser() parser.add_argument( "--infer_img", type=str, default='demo/000000014439.jpg', # hxw: 404x640 help="Image path, has higher priority over --infer_dir") parser.add_argument("--weights", type=str, default='output/faster_rcnn_r50_vd_fpn_2x_coco_paddlejob/best_model.pdparams' ) parser.add_argument("--cam_out", type=str, default='cam_faster_rcnn' ) parser.add_argument("--use_gpu", type=bool, default=True) parser.add_argument( "--infer_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--output_dir", type=str, default="output", help="Directory for storing the output visualization files.") parser.add_argument( "--draw_threshold", type=float, default=0.8, help="Threshold to reserve the result for visualization.") parser.add_argument( "--save_results", type=bool, default=False, help="Whether to save inference results to output_dir.") parser.add_argument( "--target_feature_layer_name", type=str, default='model.backbone', # define the featuremap to show grad cam, such as model.backbone, model.bbox_head.roi_extractor help="The feature map to compute Grad-CAM on, such as model.backbone or model.bbox_head.roi_extractor.") args = parser.parse_args() return args def run(FLAGS, cfg): assert cfg.architecture in ['FasterRCNN', 'MaskRCNN', 'YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead', 'BlazeFace', 'SSD', 'RetinaNet'], \ 'CAM visualization is only supported for the architectures listed above for now, ' \ 'other architectures are not supported yet!' bbox_cam = BBoxCAM(FLAGS, cfg) bbox_cam.get_bboxes_cams() print('finish') def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/eval.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
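tools/eval.py below, like cam_ppdet.py above and the other entry points in this directory, repeats the same device-selection ladder. Distilled as a small sketch (not repo code; assumes cfg behaves like a dict):

import paddle

def select_device(cfg):
    # Precedence used across these tools: gpu > npu > xpu > mlu > cpu.
    for flag, dev in (('use_gpu', 'gpu'), ('use_npu', 'npu'),
                      ('use_xpu', 'xpu'), ('use_mlu', 'mlu')):
        if cfg.get(flag, False):
            return paddle.set_device(dev)
    return paddle.set_device('cpu')

place = select_device({'use_gpu': False})  # falls through to CPU
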
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import create, load_config, merge_config from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser, merge_args from ppdet.engine import Trainer, Trainer_ARSL, init_parallel_env from ppdet.metrics.coco_utils import json_eval_results from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('eval') def parse_args(): parser = ArgsParser() parser.add_argument( "--output_eval", default=None, type=str, help="Evaluation directory, default is current directory.") parser.add_argument( '--json_eval', action='store_true', default=False, help='Whether to re eval with already exists bbox.json or mask.json') parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") # TODO: bias should be unified parser.add_argument( "--bias", action="store_true", help="whether add bias or not while getting w and h") parser.add_argument( "--classwise", action="store_true", help="whether per-category AP and draw P-R Curve or not.") parser.add_argument( '--save_prediction_only', action='store_true', default=False, help='Whether to save the evaluation results only') parser.add_argument( "--amp", action='store_true', default=False, help="Enable auto mixed precision eval.") # for smalldet slice_infer parser.add_argument( "--slice_infer", action='store_true', help="Whether to slice the image and merge the inference results for small object detection." ) parser.add_argument( '--slice_size', nargs='+', type=int, default=[640, 640], help="Height of the sliced image.") parser.add_argument( "--overlap_ratio", nargs='+', type=float, default=[0.25, 0.25], help="Overlap height ratio of the sliced image.") parser.add_argument( "--combine_method", type=str, default='nms', help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." ) parser.add_argument( "--match_threshold", type=float, default=0.6, help="Combine method matching threshold.") parser.add_argument( "--match_metric", type=str, default='ios', help="Combine method matching metric, choose in ['iou', 'ios'].") args = parser.parse_args() return args def run(FLAGS, cfg): if FLAGS.json_eval: logger.info( "In json_eval mode, PaddleDetection will evaluate json files in " "output_eval directly. 
And proposal.json, bbox.json and mask.json " "will be detected by default.") json_eval_results( cfg.metric, json_directory=FLAGS.output_eval, dataset=create('EvalDataset')()) return # init parallel environment if nranks > 1 init_parallel_env() ssod_method = cfg.get('ssod_method', None) if ssod_method == 'ARSL': # build ARSL_trainer trainer = Trainer_ARSL(cfg, mode='eval') # load ARSL_weights trainer.load_weights(cfg.weights, ARSL_eval=True) else: # build trainer trainer = Trainer(cfg, mode='eval') #load weights trainer.load_weights(cfg.weights) # training if FLAGS.slice_infer: trainer.evaluate_slice( slice_size=FLAGS.slice_size, overlap_ratio=FLAGS.overlap_ratio, combine_method=FLAGS.combine_method, match_threshold=FLAGS.match_threshold, match_metric=FLAGS.match_metric) else: trainer.evaluate() def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config, mode='eval') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/eval_mot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
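The default path of tools/eval.py above (no json_eval, no ARSL, no slice inference) reduces to a few engine calls. A minimal sketch with a hypothetical config path:

import paddle
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer

cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
paddle.set_device('gpu' if cfg.get('use_gpu', False) else 'cpu')
trainer = Trainer(cfg, mode='eval')
trainer.load_weights(cfg.weights)  # cfg.weights should point at trained .pdparams
trainer.evaluate()
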
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser from ppdet.engine import Tracker def parse_args(): parser = ArgsParser() parser.add_argument( "--det_results_dir", type=str, default='', help="Directory name for detection results.") parser.add_argument( '--output_dir', type=str, default='output', help='Directory name for output tracking results.') parser.add_argument( '--save_images', action='store_true', help='Save tracking results (image).') parser.add_argument( '--save_videos', action='store_true', help='Save tracking results (video).') parser.add_argument( '--show_image', action='store_true', help='Show tracking results (image).') parser.add_argument( '--scaled', type=bool, default=False, help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.") args = parser.parse_args() return args def run(FLAGS, cfg): dataset_dir = cfg['EvalMOTDataset'].dataset_dir data_root = cfg['EvalMOTDataset'].data_root data_root = '{}/{}'.format(dataset_dir, data_root) seqs = os.listdir(data_root) seqs.sort() # build Tracker tracker = Tracker(cfg, mode='eval') # load weights if cfg.architecture in ['DeepSORT', 'ByteTrack']: tracker.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: tracker.load_weights_jde(cfg.weights) # inference tracker.mot_evaluate( data_root=data_root, seqs=seqs, data_type=cfg.metric.lower(), model_type=cfg.architecture, output_dir=FLAGS.output_dir, save_images=FLAGS.save_images, save_videos=FLAGS.save_videos, show_image=FLAGS.show_image, scaled=FLAGS.scaled, det_results_dir=FLAGS.det_results_dir) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/export_model.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.utils.cli import ArgsParser from ppdet.engine import Trainer from ppdet.engine.trainer_ssod import Trainer_ARSL from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('export_model') def parse_args(): parser = ArgsParser() parser.add_argument( "--output_dir", type=str, default="output_inference", help="Directory for storing the output model files.") parser.add_argument( "--export_serving_model", type=bool, default=False, help="Whether to export serving model or not.") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument("--for_fd", action='store_true') args = parser.parse_args() return args def run(FLAGS, cfg): ssod_method = cfg.get('ssod_method', None) if ssod_method is not None and ssod_method == 'ARSL': trainer = Trainer_ARSL(cfg, mode='test') trainer.load_weights(cfg.weights, ARSL_eval=True) # build detector else: trainer = Trainer(cfg, mode='test') # load weights if cfg.architecture in ['DeepSORT', 'ByteTrack']: trainer.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: trainer.load_weights(cfg.weights) # export model trainer.export(FLAGS.output_dir, for_fd=FLAGS.for_fd) if FLAGS.export_serving_model: assert not FLAGS.for_fd from paddle_serving_client.io import inference_model_to_serving model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0] inference_model_to_serving( dirname="{}/{}".format(FLAGS.output_dir, model_name), serving_server="{}/{}/serving_server".format(FLAGS.output_dir, model_name), serving_client="{}/{}/serving_client".format(FLAGS.output_dir, model_name), model_filename="model.pdmodel", params_filename="model.pdiparams") def main(): paddle.set_device("cpu") FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') # FIXME: Temporarily solve the priority problem of FLAGS.opt merge_config(FLAGS.opt) check_config(cfg) if 'use_gpu' not in cfg: cfg.use_gpu = False check_gpu(cfg.use_gpu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/gen_semi_coco.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
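For the common case (no SSOD, no tracking weights, no serving export), tools/export_model.py above amounts to the following; the config path is hypothetical:

import paddle
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer

paddle.set_device('cpu')  # export_model.py pins export to CPU
cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
trainer = Trainer(cfg, mode='test')
trainer.load_weights(cfg.weights)
trainer.export('output_inference')
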
import os import json import argparse import numpy as np def save_json(path, images, annotations, categories): new_json = { 'images': images, 'annotations': annotations, 'categories': categories, } with open(path, 'w') as f: json.dump(new_json, f) print('{} saved, with {} images and {} annotations.'.format( path, len(images), len(annotations))) def gen_semi_data(data_dir, json_file, percent=10.0, seed=1, seed_offset=0, txt_file=None): json_name = json_file.split('/')[-1].split('.')[0] json_file = os.path.join(data_dir, json_file) anno = json.load(open(json_file, 'r')) categories = anno['categories'] all_images = anno['images'] all_anns = anno['annotations'] print( 'Totally {} images and {} annotations, about {} gts per image.'.format( len(all_images), len(all_anns), len(all_anns) / len(all_images))) if txt_file: print('Using percent {} and seed {}.'.format(percent, seed)) txt_file = os.path.join(data_dir, txt_file) sup_idx = json.load(open(txt_file, 'r'))[str(percent)][str(seed)] # max(sup_idx) = 117262 # 10%, sup_idx is not image_id else: np.random.seed(seed + seed_offset) sup_len = int(percent / 100.0 * len(all_images)) sup_idx = np.random.choice( range(len(all_images)), size=sup_len, replace=False) labeled_images, labeled_anns = [], [] labeled_im_ids = [] unlabeled_images, unlabeled_anns = [], [] for i in range(len(all_images)): if i in sup_idx: labeled_im_ids.append(all_images[i]['id']) labeled_images.append(all_images[i]) else: unlabeled_images.append(all_images[i]) for an in all_anns: im_id = an['image_id'] if im_id in labeled_im_ids: labeled_anns.append(an) else: continue save_path = '{}/{}'.format(data_dir, 'semi_annotations') if not os.path.exists(save_path): os.mkdir(save_path) sup_name = '{}.{}@{}.json'.format(json_name, seed, int(percent)) sup_path = os.path.join(save_path, sup_name) save_json(sup_path, labeled_images, labeled_anns, categories) unsup_name = '{}.{}@{}-unlabeled.json'.format(json_name, seed, int(percent)) unsup_path = os.path.join(save_path, unsup_name) save_json(unsup_path, unlabeled_images, unlabeled_anns, categories) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--data_dir', type=str, default='./dataset/coco') parser.add_argument( '--json_file', type=str, default='annotations/instances_train2017.json') parser.add_argument('--percent', type=float, default=10.0) parser.add_argument('--seed', type=int, default=1) parser.add_argument('--seed_offset', type=int, default=0) parser.add_argument('--txt_file', type=str, default='COCO_supervision.txt') args = parser.parse_args() print(args) gen_semi_data(args.data_dir, args.json_file, args.percent, args.seed, args.seed_offset, args.txt_file) ================================================ FILE: tools/infer.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
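A hedged example of calling gen_semi_data above directly for a 10% random split, bypassing COCO_supervision.txt; assumes tools/ is importable:

from tools.gen_semi_coco import gen_semi_data

gen_semi_data(
    data_dir='./dataset/coco',
    json_file='annotations/instances_train2017.json',
    percent=10.0, seed=1,
    txt_file=None)  # txt_file=None -> random split via np.random
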
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import glob import ast import paddle from ppdet.core.workspace import create, load_config, merge_config from ppdet.engine import Trainer, Trainer_ARSL from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser, merge_args from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('train') def parse_args(): parser = ArgsParser() parser.add_argument( "--infer_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--infer_list", type=str, default=None, help="The file path containing path of image to be infered. Valid only when --infer_dir is given." ) parser.add_argument( "--infer_img", type=str, default=None, help="Image path, has higher priority over --infer_dir") parser.add_argument( "--output_dir", type=str, default="output", help="Directory for storing the output visualization files.") parser.add_argument( "--draw_threshold", type=float, default=0.5, help="Threshold to reserve the result for visualization.") parser.add_argument( "--save_threshold", type=float, default=0.5, help="Threshold to reserve the result for saving.") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument( "--use_vdl", type=bool, default=False, help="Whether to record the data to VisualDL.") parser.add_argument( "--do_eval", type=ast.literal_eval, default=False, help="Whether to eval after infer.") parser.add_argument( '--vdl_log_dir', type=str, default="vdl_log_dir/image", help='VisualDL logging directory for image.') parser.add_argument( "--save_results", type=bool, default=False, help="Whether to save inference results to output_dir.") parser.add_argument( "--slice_infer", action='store_true', help="Whether to slice the image and merge the inference results for small object detection." ) parser.add_argument( '--slice_size', nargs='+', type=int, default=[640, 640], help="Height of the sliced image.") parser.add_argument( "--overlap_ratio", nargs='+', type=float, default=[0.25, 0.25], help="Overlap height ratio of the sliced image.") parser.add_argument( "--combine_method", type=str, default='nms', help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." 
) parser.add_argument( "--match_threshold", type=float, default=0.6, help="Combine method matching threshold.") parser.add_argument( "--match_metric", type=str, default='ios', help="Combine method matching metric, choose in ['iou', 'ios'].") parser.add_argument( "--visualize", type=ast.literal_eval, default=True, help="Whether to save visualize results to output_dir.") parser.add_argument( "--rtn_im_file", type=bool, default=False, help="Whether to return image file path in Dataloader.") args = parser.parse_args() return args def get_test_images(infer_dir, infer_img, infer_list=None): """ Get image path list in TEST mode """ assert infer_img is not None or infer_dir is not None, \ "--infer_img or --infer_dir should be set" assert infer_img is None or os.path.isfile(infer_img), \ "{} is not a file".format(infer_img) assert infer_dir is None or os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) # infer_img has a higher priority if infer_img and os.path.isfile(infer_img): return [infer_img] images = set() infer_dir = os.path.abspath(infer_dir) assert os.path.isdir(infer_dir), \ "infer_dir {} is not a directory".format(infer_dir) if infer_list: assert os.path.isfile( infer_list), f"infer_list {infer_list} is not a valid file path." with open(infer_list, 'r') as f: lines = f.readlines() for line in lines: images.update([os.path.join(infer_dir, line.strip())]) else: exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) assert len(images) > 0, "no image found in {}".format(infer_dir) logger.info("Found {} inference images in total.".format(len(images))) return images def run(FLAGS, cfg): if FLAGS.rtn_im_file: cfg['TestReader']['sample_transforms'][0]['Decode'][ 'rtn_im_file'] = FLAGS.rtn_im_file ssod_method = cfg.get('ssod_method', None) if ssod_method == 'ARSL': trainer = Trainer_ARSL(cfg, mode='test') trainer.load_weights(cfg.weights, ARSL_eval=True) else: trainer = Trainer(cfg, mode='test') trainer.load_weights(cfg.weights) # get inference images if FLAGS.do_eval: dataset = create('TestDataset')() images = dataset.get_images() else: images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img, FLAGS.infer_list) # inference if FLAGS.slice_infer: trainer.slice_predict( images, slice_size=FLAGS.slice_size, overlap_ratio=FLAGS.overlap_ratio, combine_method=FLAGS.combine_method, match_threshold=FLAGS.match_threshold, match_metric=FLAGS.match_metric, draw_threshold=FLAGS.draw_threshold, output_dir=FLAGS.output_dir, save_results=FLAGS.save_results, visualize=FLAGS.visualize) else: trainer.predict( images, draw_threshold=FLAGS.draw_threshold, output_dir=FLAGS.output_dir, save_results=FLAGS.save_results, visualize=FLAGS.visualize, save_threshold=FLAGS.save_threshold, do_eval=FLAGS.do_eval) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') if FLAGS.slim_config: cfg = build_slim_model(cfg, 
FLAGS.slim_config, mode='test') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/infer_culane.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import glob import ast import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.engine import Trainer from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser, merge_args from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('train') def parse_args(): parser = ArgsParser() parser.add_argument( "--infer_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--infer_img", type=str, default=None, help="Image path, has higher priority over --infer_dir") parser.add_argument( "--output_dir", type=str, default="output", help="Directory for storing the output visualization files.") parser.add_argument( "--save_results", type=bool, default=False, help="Whether to save inference results to output_dir.") parser.add_argument( "--visualize", type=ast.literal_eval, default=True, help="Whether to save visualize results to output_dir.") args = parser.parse_args() return args def get_test_images(infer_dir, infer_img): """ Get image path list in TEST mode """ assert infer_img is not None or infer_dir is not None, \ "--infer_img or --infer_dir should be set" assert infer_img is None or os.path.isfile(infer_img), \ "{} is not a file".format(infer_img) assert infer_dir is None or os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) # infer_img has a higher priority if infer_img and os.path.isfile(infer_img): return [infer_img] images = set() infer_dir = os.path.abspath(infer_dir) assert os.path.isdir(infer_dir), \ "infer_dir {} is not a directory".format(infer_dir) exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) assert len(images) > 0, "no image found in {}".format(infer_dir) logger.info("Found {} inference images in total.".format(len(images))) return images def run(FLAGS, cfg): # build trainer trainer = Trainer(cfg, mode='test') # load weights trainer.load_weights(cfg.weights) # get inference images images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img) trainer.predict_culane( images, 
output_dir=FLAGS.output_dir, save_results=FLAGS.save_results, visualize=FLAGS.visualize) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/infer_mot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.engine import Tracker from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser def parse_args(): parser = ArgsParser() parser.add_argument( '--video_file', type=str, default=None, help='Video name for tracking.') parser.add_argument( '--frame_rate', type=int, default=-1, help='Video frame rate for tracking.') parser.add_argument( "--image_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--det_results_dir", type=str, default='', help="Directory name for detection results.") parser.add_argument( '--output_dir', type=str, default='output', help='Directory name for output tracking results.') parser.add_argument( '--save_images', action='store_true', help='Save tracking results (image).') parser.add_argument( '--save_videos', action='store_true', help='Save tracking results (video).') parser.add_argument( '--show_image', action='store_true', help='Show tracking results (image).') parser.add_argument( '--scaled', type=bool, default=False, help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.") parser.add_argument( "--draw_threshold", type=float, default=0.5, help="Threshold to reserve the result for visualization.") args = parser.parse_args() return args def run(FLAGS, cfg): # build Tracker tracker = Tracker(cfg, mode='test') # load weights if cfg.architecture in ['DeepSORT', 
'ByteTrack']: tracker.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: tracker.load_weights_jde(cfg.weights) # inference tracker.mot_predict_seq( video_file=FLAGS.video_file, frame_rate=FLAGS.frame_rate, image_dir=FLAGS.image_dir, data_type=cfg.metric.lower(), model_type=cfg.architecture, output_dir=FLAGS.output_dir, save_images=FLAGS.save_images, save_videos=FLAGS.save_videos, show_image=FLAGS.show_image, scaled=FLAGS.scaled, det_results_dir=FLAGS.det_results_dir, draw_threshold=FLAGS.draw_threshold) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/post_quant.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
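# Editor's note (usage sketch): this tool builds an eval-mode Trainer, loads
# trained weights, then exports a quantized inference model through
# Trainer.post_quant(FLAGS.output_dir). An illustrative invocation follows;
# the slim config path is hypothetical, since this repo's configs/ tree may
# not ship a PTQ yaml:
#   python tools/post_quant.py -c configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml \
#       --slim_config <path/to/ptq_slim_config>.yml --output_dir output_inference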
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.utils.cli import ArgsParser from ppdet.engine import Trainer from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('post_quant') def parse_args(): parser = ArgsParser() parser.add_argument( "--output_dir", type=str, default="output_inference", help="Directory for storing the output model files.") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") args = parser.parse_args() return args def run(FLAGS, cfg): # build detector trainer = Trainer(cfg, mode='eval') # load weights if cfg.architecture in ['DeepSORT']: if cfg.det_weights != 'None': trainer.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: trainer.load_weights_sde(None, cfg.reid_weights) else: trainer.load_weights(cfg.weights) # post quant model trainer.post_quant(FLAGS.output_dir) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) # TODO: to be refined in the future if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn': FLAGS.opt['norm_type'] = 'bn' merge_config(FLAGS.opt) if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') # FIXME: Temporarily solve the priority problem of FLAGS.opt merge_config(FLAGS.opt) check_config(cfg) if 'use_gpu' not in cfg: cfg.use_gpu = False check_gpu(cfg.use_gpu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/slice_image.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
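# Editor's note (usage sketch): this tool slices a COCO-format dataset into
# overlapping chips via the external `sahi` package
# (https://github.com/obss/sahi). Illustrative invocation; the dataset paths
# are placeholders, not files shipped with this repo:
#   python tools/slice_image.py --image_dir <images_dir> \
#       --json_path <coco_annotations>.json --output_dir output \
#       --slice_size 640 --overlap_ratio 0.25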
import argparse
from tqdm import tqdm


def slice_data(image_dir, dataset_json_path, output_dir, slice_size,
               overlap_ratio):
    try:
        from sahi.scripts.slice_coco import slice
    except Exception as e:
        raise RuntimeError(
            'Unable to use sahi to slice images, please install sahi, '
            'for example: `pip install sahi`, see https://github.com/obss/sahi'
        ) from e
    tqdm.write(
        f"slicing for slice_size={slice_size}, overlap_ratio={overlap_ratio}")
    slice(
        image_dir=image_dir,
        dataset_json_path=dataset_json_path,
        output_dir=output_dir,
        slice_size=slice_size,
        overlap_ratio=overlap_ratio)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_dir', type=str, default=None, help="The image folder path.")
    parser.add_argument(
        '--json_path', type=str, default=None, help="Dataset json path.")
    parser.add_argument(
        '--output_dir', type=str, default=None, help="Output dir.")
    parser.add_argument(
        '--slice_size', type=int, default=500, help="Sliced sub-image size.")
    parser.add_argument(
        '--overlap_ratio',
        type=float,
        default=0.25,
        help="Overlap ratio between adjacent slices.")
    args = parser.parse_args()

    slice_data(args.image_dir, args.json_path, args.output_dir,
               args.slice_size, args.overlap_ratio)


if __name__ == "__main__":
    main()


================================================
FILE: tools/sniper_params_stats.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import json
import logging
import numpy as np

from ppdet.utils.logger import setup_logger
logger = setup_logger('sniper_params_stats')


def get_default_params(architecture):
    """get_default_params"""
    if architecture == "FasterRCNN":
        anchor_range = np.array([64., 512.])  # for frcnn-fpn
        # anchor_range = np.array([16., 373.])  # for yolov3
        # anchor_range = np.array([32., 373.])  # for yolov3
        default_crop_size = 1536  # mod 32 for frcnn-fpn
        default_max_bbox_size = 352
    elif architecture == "YOLOv3":
        anchor_range = np.array([32., 373.])  # for yolov3
        default_crop_size = 800  # mod 32 for yolov3
        default_max_bbox_size = 352
    else:
        raise NotImplementedError

    return anchor_range, default_crop_size, default_max_bbox_size


def get_box_ratios(anno_file):
    """
    get_box_ratios
    :param anno_file: coco annotation file
    :return: box_ratios: (box_long_size / pic_long_size)
    """
    with open(anno_file) as f:
        coco_dict = json.load(f)
    image_list = coco_dict['images']
    anno_list = coco_dict['annotations']

    image_id2hw = {}
    for im_dict in image_list:
        im_id = im_dict['id']
        h, w = im_dict['height'], im_dict['width']
        image_id2hw[im_id] = (h, w)

    box_ratios = []
    for a_dict in anno_list:
        im_id = a_dict['image_id']
        im_h, im_w = image_id2hw[im_id]
        bbox = a_dict['bbox']
        x1, y1, w, h = bbox
        pic_long = max(im_h, im_w)
        box_long = max(w, h)
        box_ratios.append(box_long / pic_long)

    return np.array(box_ratios)


def get_target_size_and_valid_box_ratios(anchor_range, box_ratio_p2,
                                         box_ratio_p98):
    """get_target_size_and_valid_box_ratios"""
    anchor_better_low, anchor_better_high = anchor_range  # (60., 512.)
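    # Descriptive note: the anchor range and the observed box-size ratios are
    # compared on a log10 scale below; the ratio range is cut into
    # ceil(box_ratio_log_range / anchor_log_range) windows so that each window
    # of box sizes can be rescaled to land inside the favorable anchor range.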
anchor_center = np.sqrt(anchor_better_high * anchor_better_low) anchor_log_range = np.log10(anchor_better_high) - np.log10(anchor_better_low) box_ratio_log_range = np.log10(box_ratio_p98) - np.log10(box_ratio_p2) logger.info("anchor_log_range:{}, box_ratio_log_range:{}".format(anchor_log_range, box_ratio_log_range)) box_cut_num = int(np.ceil(box_ratio_log_range / anchor_log_range)) box_ratio_log_window = box_ratio_log_range / box_cut_num logger.info("box_cut_num:{}, box_ratio_log_window:{}".format(box_cut_num, box_ratio_log_window)) image_target_sizes = [] valid_ratios = [] for i in range(box_cut_num): # # method1: align center # box_ratio_log_center = np.log10(p2) + 0.5 * box_ratio_log_window + i * box_ratio_log_window # box_ratio_center = np.power(10, box_ratio_log_center) # scale = anchor_center / box_ratio_center # method2: align left low box_ratio_low = np.power(10, np.log10(box_ratio_p2) + i * box_ratio_log_window) image_target_size = anchor_better_low / box_ratio_low image_target_sizes.append(int(image_target_size)) valid_ratio = anchor_range / image_target_size valid_ratios.append(valid_ratio.tolist()) logger.info("Box cut {}".format(i)) logger.info("box_ratio_low: {}".format(box_ratio_low)) logger.info("image_target_size: {}".format(image_target_size)) logger.info("valid_ratio: {}".format(valid_ratio)) return image_target_sizes, valid_ratios def get_valid_ranges(valid_ratios): """ get_valid_box_ratios_range :param valid_ratios: :return: """ valid_ranges = [] if len(valid_ratios) == 1: valid_ranges.append([-1, -1]) else: for i, vratio in enumerate(valid_ratios): if i == 0: valid_ranges.append([-1, vratio[1]]) elif i == len(valid_ratios) - 1: valid_ranges.append([vratio[0], -1]) else: valid_ranges.append(vratio) return valid_ranges def get_percentile(a_array, low_percent, high_percent): """ get_percentile :param low_percent: :param high_percent: :return: """ array_p0 = min(a_array) array_p100 = max(a_array) array_plow = np.percentile(a_array, low_percent) array_phigh = np.percentile(a_array, high_percent) logger.info( "array_percentile(0): {},array_percentile low({}): {}, " "array_percentile high({}): {}, array_percentile 100: {}".format( array_p0, low_percent, array_plow, high_percent, array_phigh, array_p100)) return array_plow, array_phigh def sniper_anno_stats(architecture, anno_file): """ sniper_anno_stats :param anno_file: :return: """ anchor_range, default_crop_size, default_max_bbox_size = get_default_params(architecture) box_ratios = get_box_ratios(anno_file) box_ratio_p8, box_ratio_p92 = get_percentile(box_ratios, 8, 92) image_target_sizes, valid_box_ratios = get_target_size_and_valid_box_ratios(anchor_range, box_ratio_p8, box_ratio_p92) valid_ranges = get_valid_ranges(valid_box_ratios) crop_size = min(default_crop_size, min([item for item in image_target_sizes])) crop_size = int(np.ceil(crop_size / 32.) * 32.) 
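    # Descriptive note: this stride leaves at most default_max_bbox_size
    # pixels of overlap between neighboring chips, so boxes up to that size
    # are never split by a chip border.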
crop_stride = max(min(default_max_bbox_size, crop_size), crop_size - default_max_bbox_size) logger.info("Result".center(100, '-')) logger.info("image_target_sizes: {}".format(image_target_sizes)) logger.info("valid_box_ratio_ranges: {}".format(valid_ranges)) logger.info("chip_target_size: {}, chip_target_stride: {}".format(crop_size, crop_stride)) return { "image_target_sizes": image_target_sizes, "valid_box_ratio_ranges": valid_ranges, "chip_target_size": crop_size, "chip_target_stride": crop_stride } if __name__=="__main__": architecture, anno_file = sys.argv[1], sys.argv[2] sniper_anno_stats(architecture, anno_file) ================================================ FILE: tools/train.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import cv2 cv2.setNumThreads(0) cv2.ocl.setUseOpenCL(False) import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.engine import Trainer, TrainerCot, init_parallel_env, set_random_seed, init_fleet_env from ppdet.engine.trainer_ssod import Trainer_DenseTeacher, Trainer_ARSL, Trainer_Semi_RTDETR from ppdet.slim import build_slim_model from ppdet.utils.cli import ArgsParser, merge_args import ppdet.utils.check as check from ppdet.utils.logger import setup_logger logger = setup_logger('train') def parse_args(): parser = ArgsParser() parser.add_argument( "--eval", action='store_true', default=False, help="Whether to perform evaluation in train") parser.add_argument( "-r", "--resume", default=None, help="weights path for resume") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument( "--enable_ce", type=bool, default=False, help="If set True, enable continuous evaluation job." 
"This flag is only used for internal test.") parser.add_argument( "--amp", action='store_true', default=False, help="Enable auto mixed precision training.") parser.add_argument( "--fleet", action='store_true', default=False, help="Use fleet or not") parser.add_argument( "--use_vdl", type=bool, default=False, help="whether to record the data to VisualDL.") parser.add_argument( '--vdl_log_dir', type=str, default="vdl_log_dir/scalar", help='VisualDL logging directory for scalar.') parser.add_argument( "--use_wandb", type=bool, default=False, help="whether to record the data to wandb.") parser.add_argument( '--save_prediction_only', action='store_true', default=False, help='Whether to save the evaluation results only') parser.add_argument( '--profiler_options', type=str, default=None, help="The option of profiler, which should be in " "format \"key1=value1;key2=value2;key3=value3\"." "please see ppdet/utils/profiler.py for detail.") parser.add_argument( '--save_proposals', action='store_true', default=False, help='Whether to save the train proposals') parser.add_argument( '--proposals_path', type=str, default="sniper/proposals.json", help='Train proposals directory') parser.add_argument( "--to_static", action='store_true', default=False, help="Enable dy2st to train.") args = parser.parse_args() return args def run(FLAGS, cfg): # init fleet environment if cfg.fleet: init_fleet_env(cfg.get('find_unused_parameters', False)) else: # init parallel environment if nranks > 1 init_parallel_env() if FLAGS.enable_ce: set_random_seed(0) # build trainer ssod_method = cfg.get('ssod_method', None) if ssod_method is not None: if ssod_method == 'DenseTeacher': trainer = Trainer_DenseTeacher(cfg, mode='train') elif ssod_method == 'ARSL': trainer = Trainer_ARSL(cfg, mode='train') elif ssod_method == 'Semi_RTDETR': trainer = Trainer_Semi_RTDETR(cfg, mode='train') else: raise ValueError( "Semi-Supervised Object Detection only no support this method.") elif cfg.get('use_cot', False): trainer = TrainerCot(cfg, mode='train') else: trainer = Trainer(cfg, mode='train') # load weights if FLAGS.resume is not None: trainer.resume_weights(FLAGS.resume) elif 'pretrain_student_weights' in cfg and 'pretrain_teacher_weights' in cfg \ and cfg.pretrain_teacher_weights and cfg.pretrain_student_weights: trainer.load_semi_weights(cfg.pretrain_teacher_weights, cfg.pretrain_student_weights) elif 'pretrain_weights' in cfg and cfg.pretrain_weights: trainer.load_weights(cfg.pretrain_weights) # training trainer.train(FLAGS.eval) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config) # FIXME: Temporarily solve the priority problem of FLAGS.opt merge_config(FLAGS.opt) check.check_config(cfg) check.check_gpu(cfg.use_gpu) check.check_npu(cfg.use_npu) check.check_xpu(cfg.use_xpu) check.check_mlu(cfg.use_mlu) check.check_version() run(FLAGS, cfg) if __name__ == "__main__": main() 
================================================
FILE: tools/x2coco.py
================================================
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import glob
import json
import os
import os.path as osp
import shutil
import xml.etree.ElementTree as ET

import numpy as np
# Import PIL.Image explicitly: get_bbox() below uses PIL.Image.fromarray,
# which previously only worked through PIL.ImageDraw's transitive import.
import PIL.Image
import PIL.ImageDraw
from tqdm import tqdm
import cv2

label_to_num = {}
categories_list = []
labels_list = []


class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)


def images_labelme(data, num):
    image = {}
    image['height'] = data['imageHeight']
    image['width'] = data['imageWidth']
    image['id'] = num + 1
    if '\\' in data['imagePath']:
        image['file_name'] = data['imagePath'].split('\\')[-1]
    else:
        image['file_name'] = data['imagePath'].split('/')[-1]
    return image


def images_cityscape(data, num, img_file):
    image = {}
    image['height'] = data['imgHeight']
    image['width'] = data['imgWidth']
    image['id'] = num + 1
    image['file_name'] = img_file
    return image


def categories(label, labels_list):
    category = {}
    category['supercategory'] = 'component'
    category['id'] = len(labels_list) + 1
    category['name'] = label
    return category


def annotations_rectangle(points, label, image_num, object_num, label_to_num):
    annotation = {}
    seg_points = np.asarray(points).copy()
    seg_points[1, :] = np.asarray(points)[2, :]
    seg_points[2, :] = np.asarray(points)[1, :]
    annotation['segmentation'] = [list(seg_points.flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(
        map(float, [
            points[0][0], points[0][1], points[1][0] - points[0][0],
            points[1][1] - points[0][1]
        ]))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def annotations_polygon(height, width, points, label, image_num, object_num,
                        label_to_num):
    annotation = {}
    annotation['segmentation'] = [list(np.asarray(points).flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def get_bbox(height, width, points):
    polygons = points
    mask = np.zeros([height, width], dtype=np.uint8)
    mask = PIL.Image.fromarray(mask)
    xy = list(map(tuple, polygons))
    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)
    mask = np.array(mask, dtype=bool)
    index = np.argwhere(mask == 1)
    rows = index[:, 0]
    cols = index[:, 1]
    left_top_r = np.min(rows)
    left_top_c = np.min(cols)
    right_bottom_r = np.max(rows)
    right_bottom_c = np.max(cols)
    return [
        left_top_c, left_top_r, right_bottom_c - left_top_c,
right_bottom_r - left_top_r ] def deal_json(ds_type, img_path, json_path): data_coco = {} images_list = [] annotations_list = [] image_num = -1 object_num = -1 for img_file in os.listdir(img_path): img_label = os.path.splitext(img_file)[0] if img_file.split('.')[ -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']: continue label_file = osp.join(json_path, img_label + '.json') print('Generating dataset from:', label_file) image_num = image_num + 1 with open(label_file) as f: data = json.load(f) if ds_type == 'labelme': images_list.append(images_labelme(data, image_num)) elif ds_type == 'cityscape': images_list.append(images_cityscape(data, image_num, img_file)) if ds_type == 'labelme': for shapes in data['shapes']: object_num = object_num + 1 label = shapes['label'] if label not in labels_list: categories_list.append(categories(label, labels_list)) labels_list.append(label) label_to_num[label] = len(labels_list) p_type = shapes['shape_type'] if p_type == 'polygon': points = shapes['points'] annotations_list.append( annotations_polygon(data['imageHeight'], data[ 'imageWidth'], points, label, image_num, object_num, label_to_num)) if p_type == 'rectangle': (x1, y1), (x2, y2) = shapes['points'] x1, x2 = sorted([x1, x2]) y1, y2 = sorted([y1, y2]) points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]] annotations_list.append( annotations_rectangle(points, label, image_num, object_num, label_to_num)) elif ds_type == 'cityscape': for shapes in data['objects']: object_num = object_num + 1 label = shapes['label'] if label not in labels_list: categories_list.append(categories(label, labels_list)) labels_list.append(label) label_to_num[label] = len(labels_list) points = shapes['polygon'] annotations_list.append( annotations_polygon(data['imgHeight'], data[ 'imgWidth'], points, label, image_num, object_num, label_to_num)) data_coco['images'] = images_list data_coco['categories'] = categories_list data_coco['annotations'] = annotations_list return data_coco def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path): with open(labels_path, 'r') as f: labels_str = f.read().split() labels_ids = list(range(1, len(labels_str) + 1)) with open(ann_ids_path, 'r') as f: ann_ids = [lin.strip().split(' ')[-1] for lin in f.readlines()] ann_paths = [] for aid in ann_ids: if aid.endswith('xml'): ann_path = os.path.join(ann_dir_path, aid) else: ann_path = os.path.join(ann_dir_path, aid + '.xml') ann_paths.append(ann_path) return dict(zip(labels_str, labels_ids)), ann_paths def voc_get_image_info(annotation_root, im_id): filename = annotation_root.findtext('filename') assert filename is not None img_name = os.path.basename(filename) size = annotation_root.find('size') width = float(size.findtext('width')) height = float(size.findtext('height')) image_info = { 'file_name': filename, 'height': height, 'width': width, 'id': im_id } return image_info def voc_get_coco_annotation(obj, label2id): label = obj.findtext('name') assert label in label2id, "label is not in label2id." category_id = label2id[label] bndbox = obj.find('bndbox') xmin = float(bndbox.findtext('xmin')) ymin = float(bndbox.findtext('ymin')) xmax = float(bndbox.findtext('xmax')) ymax = float(bndbox.findtext('ymax')) assert xmax > xmin and ymax > ymin, "Box size error." 
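    # Convert the corner-format box (xmin, ymin, xmax, ymax) parsed above to
    # COCO's (x, y, width, height) layout.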
    o_width = xmax - xmin
    o_height = ymax - ymin
    anno = {
        'area': o_width * o_height,
        'iscrowd': 0,
        'bbox': [xmin, ymin, o_width, o_height],
        'category_id': category_id,
        'ignore': 0,
    }
    return anno


def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file):
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": []
    }
    bnd_id = 1  # bounding box start id
    im_id = 0
    print('Start converting!')
    for a_path in tqdm(annotation_paths):
        # Read annotation xml
        ann_tree = ET.parse(a_path)
        ann_root = ann_tree.getroot()

        img_info = voc_get_image_info(ann_root, im_id)
        output_json_dict['images'].append(img_info)

        for obj in ann_root.findall('object'):
            ann = voc_get_coco_annotation(obj=obj, label2id=label2id)
            ann.update({'image_id': im_id, 'id': bnd_id})
            output_json_dict['annotations'].append(ann)
            bnd_id = bnd_id + 1
        im_id += 1

    for label, label_id in label2id.items():
        category_info = {
            'supercategory': 'none',
            'id': label_id,
            'name': label
        }
        output_json_dict['categories'].append(category_info)

    output_file = os.path.join(output_dir, output_file)
    with open(output_file, 'w') as f:
        output_json = json.dumps(output_json_dict)
        f.write(output_json)


def widerface_to_cocojson(root_path):
    train_gt_txt = os.path.join(root_path, "wider_face_split",
                                "wider_face_train_bbx_gt.txt")
    val_gt_txt = os.path.join(root_path, "wider_face_split",
                              "wider_face_val_bbx_gt.txt")
    train_img_dir = os.path.join(root_path, "WIDER_train", "images")
    val_img_dir = os.path.join(root_path, "WIDER_val", "images")
    # Check that the expected ground-truth files and image folders exist
    # (asserting the joined path strings themselves would never fail).
    assert os.path.exists(train_gt_txt)
    assert os.path.exists(val_gt_txt)
    assert os.path.exists(train_img_dir)
    assert os.path.exists(val_img_dir)
    save_path = os.path.join(root_path, "widerface_train.json")
    widerface_convert(train_gt_txt, train_img_dir, save_path)
    print("Wider Face train dataset converted successfully, the json path: {}".
          format(save_path))
    save_path = os.path.join(root_path, "widerface_val.json")
    widerface_convert(val_gt_txt, val_img_dir, save_path)
    print("Wider Face val dataset converted successfully, the json path: {}".
          format(save_path))


def widerface_convert(gt_txt, img_dir, save_path):
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": [{
            'supercategory': 'none',
            'id': 0,
            'name': "human_face"
        }]
    }
    bnd_id = 1  # bounding box start id
    im_id = 0
    print('Start converting!')
    with open(gt_txt) as fd:
        lines = fd.readlines()
    i = 0
    while i < len(lines):
        image_name = lines[i].strip()
        bbox_num = int(lines[i + 1].strip())
        i += 2
        img_info = get_widerface_image_info(img_dir, image_name, im_id)
        if img_info:
            output_json_dict["images"].append(img_info)
            for j in range(i, i + bbox_num):
                anno = get_widerface_ann_info(lines[j])
                anno.update({'image_id': im_id, 'id': bnd_id})
                output_json_dict['annotations'].append(anno)
                bnd_id += 1
        else:
            print("The image does not exist: {}".format(
                os.path.join(img_dir, image_name)))
        bbox_num = 1 if bbox_num == 0 else bbox_num
        i += bbox_num
        im_id += 1
    with open(save_path, 'w') as f:
        output_json = json.dumps(output_json_dict)
        f.write(output_json)


def get_widerface_image_info(img_root, img_relative_path, img_id):
    image_info = {}
    save_path = os.path.join(img_root, img_relative_path)
    if os.path.exists(save_path):
        img = cv2.imread(save_path)
        image_info["file_name"] = os.path.join(
            os.path.basename(os.path.dirname(img_root)),
            os.path.basename(img_root), img_relative_path)
        image_info["height"] = img.shape[0]
        image_info["width"] = img.shape[1]
        image_info["id"] = img_id
    return image_info


def get_widerface_ann_info(info):
    info = [int(x) for x in info.strip().split()]
    anno = {
        'area': info[2] * info[3],
        'iscrowd': 0,
        'bbox': [info[0], info[1], info[2], info[3]],
        'category_id': 0,
        'ignore': 0,
        'blur': info[4],
        'expression': info[5],
        'illumination': info[6],
        'invalid': info[7],
        'occlusion': info[8],
        'pose': info[9]
    }
    return anno


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset_type',
        help='the type of dataset, can be `voc`, `widerface`, `labelme` or `cityscape`')
    parser.add_argument('--json_input_dir', help='input annotated directory')
    parser.add_argument('--image_input_dir', help='image directory')
    parser.add_argument(
        '--output_dir', help='output dataset directory', default='./')
    parser.add_argument(
        '--train_proportion',
        help='the proportion of train dataset',
        type=float,
        default=1.0)
    parser.add_argument(
        '--val_proportion',
        help='the proportion of validation dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--test_proportion',
        help='the proportion of test dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--voc_anno_dir',
        help='In Voc format dataset, path to annotation files directory.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_anno_list',
        help='In Voc format dataset, path to annotation files ids list.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_label_list',
        help='In Voc format dataset, path to label list. The content of each '
        'line is a category.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_out_name',
        type=str,
        default='voc.json',
        help='In Voc format dataset, path to output json file')
    parser.add_argument(
        '--widerface_root_dir',
        help='The root_path for wider face dataset, which contains '
        '`wider_face_split`, `WIDER_train` and `WIDER_val`. '
        'The json files will be saved in this path.',
        type=str,
        default=None)
    args = parser.parse_args()
    try:
        assert args.dataset_type in ['voc', 'labelme', 'cityscape', 'widerface']
    except AssertionError as e:
        print(
            'Only the voc, labelme, cityscape and widerface dataset types are supported!')
        os._exit(0)
    if args.dataset_type == 'voc':
        assert args.voc_anno_dir and args.voc_anno_list and args.voc_label_list
        label2id, ann_paths = voc_get_label_anno(
            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)
        voc_xmls_to_cocojson(
            annotation_paths=ann_paths,
            label2id=label2id,
            output_dir=args.output_dir,
            output_file=args.voc_out_name)
    elif args.dataset_type == "widerface":
        assert args.widerface_root_dir
        widerface_to_cocojson(args.widerface_root_dir)
    else:
        try:
            assert os.path.exists(args.json_input_dir)
        except AssertionError as e:
            print('The json folder does not exist!')
            os._exit(0)
        try:
            assert os.path.exists(args.image_input_dir)
        except AssertionError as e:
            print('The image folder does not exist!')
            os._exit(0)
        try:
            assert abs(args.train_proportion + args.val_proportion \
                    + args.test_proportion - 1.0) < 1e-5
        except AssertionError as e:
            print(
                'The sum of the proportions of the training, validation and '
                'test datasets must be 1!')
            os._exit(0)

        # Allocate the dataset.
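        # Images are copied into train/val/test subfolders in directory-listing
        # order according to the requested proportions; the matching per-image
        # JSON annotations are then merged into COCO files per split below.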
total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json'))) if args.train_proportion != 0: train_num = int(total_num * args.train_proportion) out_dir = args.output_dir + '/train' if not os.path.exists(out_dir): os.makedirs(out_dir) else: train_num = 0 if args.val_proportion == 0.0: val_num = 0 test_num = total_num - train_num out_dir = args.output_dir + '/test' if args.test_proportion != 0.0 and not os.path.exists(out_dir): os.makedirs(out_dir) else: val_num = int(total_num * args.val_proportion) test_num = total_num - train_num - val_num val_out_dir = args.output_dir + '/val' if not os.path.exists(val_out_dir): os.makedirs(val_out_dir) test_out_dir = args.output_dir + '/test' if args.test_proportion != 0.0 and not os.path.exists(test_out_dir): os.makedirs(test_out_dir) count = 1 for img_name in os.listdir(args.image_input_dir): if count <= train_num: if osp.exists(args.output_dir + '/train/'): shutil.copyfile( osp.join(args.image_input_dir, img_name), osp.join(args.output_dir + '/train/', img_name)) else: if count <= train_num + val_num: if osp.exists(args.output_dir + '/val/'): shutil.copyfile( osp.join(args.image_input_dir, img_name), osp.join(args.output_dir + '/val/', img_name)) else: if osp.exists(args.output_dir + '/test/'): shutil.copyfile( osp.join(args.image_input_dir, img_name), osp.join(args.output_dir + '/test/', img_name)) count = count + 1 # Deal with the json files. if not os.path.exists(args.output_dir + '/annotations'): os.makedirs(args.output_dir + '/annotations') if args.train_proportion != 0: train_data_coco = deal_json(args.dataset_type, args.output_dir + '/train', args.json_input_dir) train_json_path = osp.join(args.output_dir + '/annotations', 'instance_train.json') json.dump( train_data_coco, open(train_json_path, 'w'), indent=4, cls=MyEncoder) if args.val_proportion != 0: val_data_coco = deal_json(args.dataset_type, args.output_dir + '/val', args.json_input_dir) val_json_path = osp.join(args.output_dir + '/annotations', 'instance_val.json') json.dump( val_data_coco, open(val_json_path, 'w'), indent=4, cls=MyEncoder) if args.test_proportion != 0: test_data_coco = deal_json(args.dataset_type, args.output_dir + '/test', args.json_input_dir) test_json_path = osp.join(args.output_dir + '/annotations', 'instance_test.json') json.dump( test_data_coco, open(test_json_path, 'w'), indent=4, cls=MyEncoder) if __name__ == '__main__': main()
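Editor's note: a closing usage sketch for the VOC branch above, calling the file's own helpers directly. The paths are illustrative; the annotation-id list and label list follow the formats produced by dataset/voc/create_list.py and dataset/voc/label_list.txt in this repo.

    # Equivalent to:
    #   python tools/x2coco.py --dataset_type voc --voc_anno_dir <...> \
    #       --voc_anno_list <...> --voc_label_list <...> --voc_out_name voc_trainval.json
    label2id, ann_paths = voc_get_label_anno(
        ann_dir_path='dataset/voc/VOCdevkit/VOC2012/Annotations',
        ann_ids_path='dataset/voc/trainval.txt',
        labels_path='dataset/voc/label_list.txt')
    voc_xmls_to_cocojson(
        annotation_paths=ann_paths,
        label2id=label2id,
        output_dir='dataset/voc',
        output_file='voc_trainval.json')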